Commit 5a1c72e

Merge tag 'kvm-x86-mmu-6.10' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.10:

 - Process TDP MMU SPTEs that are zapped while holding mmu_lock for read after replacing REMOVED_SPTE with '0' and flushing remote TLBs, which allows vCPU tasks to repopulate the zapped region while the zapper finishes tearing down the old, defunct page tables.

 - Fix a longstanding, likely benign-in-practice race where KVM could fail to detect a write from kvm_mmu_track_write() to a shadowed GPTE if the GPTE is the first page table being shadowed.
2 parents dee7ea4 + 226d9b8 commit 5a1c72e

2 files changed, 66 insertions(+), 29 deletions(-)
arch/x86/kvm/mmu/mmu.c

Lines changed: 17 additions & 3 deletions

@@ -831,6 +831,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	gfn_t gfn;
 
 	kvm->arch.indirect_shadow_pages++;
+	/*
+	 * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+	 * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+	 * emulated writes are visible before re-reading guest PTEs, or that
+	 * an emulated write will see the elevated count and acquire mmu_lock
+	 * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
+	 */
+	smp_mb();
+
 	gfn = sp->gfn;
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, gfn);

@@ -5787,10 +5796,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
 	bool flush = false;
 
 	/*
-	 * If we don't have indirect shadow pages, it means no page is
-	 * write-protected, so we can exit simply.
+	 * When emulating guest writes, ensure the written value is visible to
+	 * any task that is handling page faults before checking whether or not
+	 * KVM is shadowing a guest PTE. This ensures either KVM will create
+	 * the correct SPTE in the page fault handler, or this task will see
+	 * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in
+	 * account_shadowed().
 	 */
-	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+	smp_mb();
+	if (!vcpu->kvm->arch.indirect_shadow_pages)
 		return;
 
 	write_lock(&vcpu->kvm->mmu_lock);

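The mmu.c hunks implement a store/load pairing: the fault path elevates indirect_shadow_pages and only then re-reads the guest PTEs, while the write-tracking path publishes the emulated write and only then checks the count, with a full memory barrier on each side. A minimal, self-contained sketch of that pattern using C11 atomics (the names shadow_count, guest_pte and the two helper functions are invented for illustration; the kernel uses kvm->arch.indirect_shadow_pages, the guest page tables themselves, and smp_mb()):

#include <stdatomic.h>
#include <stdbool.h>

atomic_int shadow_count;	/* stands in for kvm->arch.indirect_shadow_pages */
atomic_int guest_pte;		/* stands in for a guest page-table entry */

/* Fault-handler side, mirroring account_shadowed(): elevate the count,
 * then order that store before re-reading the guest PTE. */
int fault_side_reread_pte(void)
{
	atomic_fetch_add_explicit(&shadow_count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	return atomic_load_explicit(&guest_pte, memory_order_relaxed);
}

/* Emulator side, mirroring kvm_mmu_track_write(): publish the written value,
 * then order that store before checking whether anything is shadowed. */
bool write_side_needs_mmu_lock(int new_pte)
{
	atomic_store_explicit(&guest_pte, new_pte, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	return atomic_load_explicit(&shadow_count, memory_order_relaxed) != 0;
}

With both fences in place the store-buffering outcome is ruled out: the fault side cannot read the stale PTE value while the write side simultaneously reads a zero count, which is exactly the either/or guarantee spelled out in the new comments.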
arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 49 additions & 26 deletions

@@ -530,6 +530,31 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
+static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte)
+{
+	u64 *sptep = rcu_dereference(iter->sptep);
+
+	/*
+	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
+	 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+	 * and pre-checking before inserting a new SPTE is advantageous as it
+	 * avoids unnecessary work.
+	 */
+	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+
+	/*
+	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+	 * does not hold the mmu_lock. On failure, i.e. if a different logical
+	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+	 * the current value, so the caller operates on fresh data, e.g. if it
+	 * retries tdp_mmu_set_spte_atomic()
+	 */
+	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+		return -EBUSY;
+
+	return 0;
+}
+
 /*
  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
  * and handle the associated bookkeeping. Do not mark the page dirty

@@ -551,27 +576,13 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
 					  struct tdp_iter *iter,
 					  u64 new_spte)
 {
-	u64 *sptep = rcu_dereference(iter->sptep);
-
-	/*
-	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
-	 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
-	 * and pre-checking before inserting a new SPTE is advantageous as it
-	 * avoids unnecessary work.
-	 */
-	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+	int ret;
 
 	lockdep_assert_held_read(&kvm->mmu_lock);
 
-	/*
-	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
-	 * does not hold the mmu_lock. On failure, i.e. if a different logical
-	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
-	 * the current value, so the caller operates on fresh data, e.g. if it
-	 * retries tdp_mmu_set_spte_atomic()
-	 */
-	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
-		return -EBUSY;
+	ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
+	if (ret)
+		return ret;
 
 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
 			    new_spte, iter->level, true);

@@ -584,13 +595,17 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 {
 	int ret;
 
+	lockdep_assert_held_read(&kvm->mmu_lock);
+
 	/*
-	 * Freeze the SPTE by setting it to a special,
-	 * non-present value. This will stop other threads from
-	 * immediately installing a present entry in its place
-	 * before the TLBs are flushed.
+	 * Freeze the SPTE by setting it to a special, non-present value. This
+	 * will stop other threads from immediately installing a present entry
+	 * in its place before the TLBs are flushed.
+	 *
+	 * Delay processing of the zapped SPTE until after TLBs are flushed and
+	 * the REMOVED_SPTE is replaced (see below).
 	 */
-	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+	ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE);
 	if (ret)
 		return ret;
 

@@ -599,12 +614,20 @@
 	/*
 	 * No other thread can overwrite the removed SPTE as they must either
 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
-	 * overwrite the special removed SPTE value. No bookkeeping is needed
-	 * here since the SPTE is going from non-present to non-present. Use
-	 * the raw write helper to avoid an unnecessary check on volatile bits.
+	 * overwrite the special removed SPTE value. Use the raw write helper to
+	 * avoid an unnecessary check on volatile bits.
 	 */
 	__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
 
+	/*
+	 * Process the zapped SPTE after flushing TLBs, and after replacing
+	 * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are
+	 * blocked by the REMOVED_SPTE and reduces contention on the child
+	 * SPTEs.
+	 */
+	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+			    0, iter->level, true);
+
 	return 0;
 }
 

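The tdp_mmu.c change keeps the freeze-with-REMOVED_SPTE protocol but defers the handle_changed_spte() processing until after the remote TLB flush and after the frozen SPTE has been replaced with the non-present value, so vCPUs can repopulate the zapped range while the zapper is still tearing down the defunct child tables. A simplified, standalone sketch of that ordering follows; REMOVED_SENTINEL, NONPRESENT_VALUE, flush_remote_tlbs() and teardown_child_tables() are placeholders invented here, not KVM's names or implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define REMOVED_SENTINEL	0x5a0ULL	/* special non-present "frozen" value */
#define NONPRESENT_VALUE	0ULL

static void flush_remote_tlbs(void)
{
	/* The real code flushes the zapped range from all remote TLBs. */
}

static void teardown_child_tables(uint64_t old_spte)
{
	/* The real code processes the now-disconnected child page tables
	 * via handle_changed_spte(). */
	(void)old_spte;
}

/* Zap one SPTE: freeze it, flush TLBs, unfreeze it, and only then do the
 * expensive teardown, so concurrent faults are blocked as briefly as possible. */
bool zap_spte(_Atomic uint64_t *sptep)
{
	uint64_t old = atomic_load_explicit(sptep, memory_order_relaxed);

	/* Freeze: anyone who sees the sentinel backs off rather than
	 * installing a present entry before the TLB flush. */
	if (!atomic_compare_exchange_strong(sptep, &old, REMOVED_SENTINEL))
		return false;	/* lost the race; the caller retries */

	flush_remote_tlbs();

	/* Unfreeze first: faulting vCPUs may now repopulate the range... */
	atomic_store_explicit(sptep, NONPRESENT_VALUE, memory_order_release);

	/* ...while this task finishes tearing down the old page tables. */
	teardown_child_tables(old);
	return true;
}

Before this change the equivalent of teardown_child_tables() ran via tdp_mmu_set_spte_atomic() while the SPTE still held REMOVED_SPTE, so a vCPU faulting on the range had to wait out the entire teardown plus the TLB flush before it could install a new entry.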