Skip to content

Commit 6e949dd

Browse files
committed
Merge branch 'kvm-tdpmmu-fixes' into kvm-master
Merge topic branch with fixes for both 5.14-rc6 and 5.15.
2 parents c5e2bf0 + ce25681 commit 6e949dd

File tree

4 files changed

+63
-15
lines changed

4 files changed

+63
-15
lines changed

Documentation/virt/kvm/locking.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ On x86:
2525

2626
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
2727

28-
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
29-
taken inside kvm->arch.mmu_lock, and cannot be taken without already
30-
holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
31-
there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
28+
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
29+
kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
30+
cannot be taken without already holding kvm->arch.mmu_lock (typically with
31+
``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
3232

3333
Everything else is a leaf: no other lock is taken inside the critical
3434
sections.

arch/x86/include/asm/kvm_host.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,13 @@ struct kvm_arch {
10381038
struct list_head lpage_disallowed_mmu_pages;
10391039
struct kvm_page_track_notifier_node mmu_sp_tracker;
10401040
struct kvm_page_track_notifier_head track_notifier_head;
1041+
/*
1042+
* Protects marking pages unsync during page faults, as TDP MMU page
1043+
* faults only take mmu_lock for read. For simplicity, the unsync
1044+
* pages lock is always taken when marking pages unsync regardless of
1045+
* whether mmu_lock is held for read or write.
1046+
*/
1047+
spinlock_t mmu_unsync_pages_lock;
10411048

10421049
struct list_head assigned_dev_head;
10431050
struct iommu_domain *iommu_domain;

arch/x86/kvm/mmu/mmu.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
25352535
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
25362536
{
25372537
struct kvm_mmu_page *sp;
2538+
bool locked = false;
25382539

25392540
/*
25402541
* Force write-protection if the page is being tracked. Note, the page
@@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
25572558
if (sp->unsync)
25582559
continue;
25592560

2561+
/*
2562+
* TDP MMU page faults require an additional spinlock as they
2563+
* run with mmu_lock held for read, not write, and the unsync
2564+
* logic is not thread safe. Take the spinlock regardless of
2565+
* the MMU type to avoid extra conditionals/parameters, there's
2566+
* no meaningful penalty if mmu_lock is held for write.
2567+
*/
2568+
if (!locked) {
2569+
locked = true;
2570+
spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
2571+
2572+
/*
2573+
* Recheck after taking the spinlock, a different vCPU
2574+
* may have since marked the page unsync. A false
2575+
* positive on the unprotected check above is not
2576+
* possible as clearing sp->unsync _must_ hold mmu_lock
2577+
* for write, i.e. unsync cannot transition from 1->0
2578+
* while this CPU holds mmu_lock for read (or write).
2579+
*/
2580+
if (READ_ONCE(sp->unsync))
2581+
continue;
2582+
}
2583+
25602584
WARN_ON(sp->role.level != PG_LEVEL_4K);
25612585
kvm_unsync_page(vcpu, sp);
25622586
}
2587+
if (locked)
2588+
spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
25632589

25642590
/*
25652591
* We need to ensure that the marking of unsync pages is visible
@@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
55375563
{
55385564
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
55395565

5566+
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5567+
55405568
if (!kvm_mmu_init_tdp_mmu(kvm))
55415569
/*
55425570
* No smp_load/store wrappers needed here as we are in

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
4343
if (!kvm->arch.tdp_mmu_enabled)
4444
return;
4545

46+
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
4647
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
4748

4849
/*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
8182
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
8283
bool shared)
8384
{
84-
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
85-
8685
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
8786

8887
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
9493
list_del_rcu(&root->link);
9594
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
9695

97-
zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
96+
zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
9897

9998
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
10099
}
@@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
724723
gfn_t start, gfn_t end, bool can_yield, bool flush,
725724
bool shared)
726725
{
726+
gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
727+
bool zap_all = (start == 0 && end >= max_gfn_host);
727728
struct tdp_iter iter;
728729

730+
/*
731+
* No need to try to step down in the iterator when zapping all SPTEs,
732+
* zapping the top-level non-leaf SPTEs will recurse on their children.
733+
*/
734+
int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
735+
736+
/*
737+
* Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
738+
* hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
739+
* and so KVM will never install a SPTE for such addresses.
740+
*/
741+
end = min(end, max_gfn_host);
742+
729743
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
730744

731745
rcu_read_lock();
732746

733-
tdp_root_for_each_pte(iter, root, start, end) {
747+
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
748+
min_level, start, end) {
734749
retry:
735750
if (can_yield &&
736751
tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -744,9 +759,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
744759
/*
745760
* If this is a non-last-level SPTE that covers a larger range
746761
* than should be zapped, continue, and zap the mappings at a
747-
* lower level.
762+
* lower level, except when zapping all SPTEs.
748763
*/
749-
if ((iter.gfn < start ||
764+
if (!zap_all &&
765+
(iter.gfn < start ||
750766
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
751767
!is_last_spte(iter.old_spte, iter.level))
752768
continue;
@@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
794810

795811
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
796812
{
797-
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
798813
bool flush = false;
799814
int i;
800815

801816
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
802-
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
817+
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
803818
flush, false);
804819

805820
if (flush)
@@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
838853
*/
839854
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
840855
{
841-
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
842856
struct kvm_mmu_page *next_root;
843857
struct kvm_mmu_page *root;
844858
bool flush = false;
@@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
854868

855869
rcu_read_unlock();
856870

857-
flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
858-
true);
871+
flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
859872

860873
/*
861874
* Put the reference acquired in

0 commit comments

Comments
 (0)