
Commit 5b0d0d8

Merge tag 'kvm-x86-mmu-6.18' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.18

- Recover possible NX huge pages within the TDP MMU under read lock to reduce
  guest jitter when restoring NX huge pages.

- Return -EAGAIN during prefault if userspace concurrently deletes/moves the
  relevant memslot, to fix an issue where prefaulting could deadlock with the
  memslot update.

- Don't retry in TDX's anti-zero-step mitigation if the target memslot is
  invalid, i.e. is being deleted or moved, to fix a deadlock scenario similar
  to the aforementioned prefaulting case.
2 parents 99cab80 + 2bc2694 commit 5b0d0d8
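The -EAGAIN change touches the KVM_PRE_FAULT_MEMORY ABI: a prefault that races with a memslot delete/move now fails back to userspace instead of retrying inside KVM. Below is a minimal, hypothetical userspace sketch of how a VMM might react. It uses the KVM_PRE_FAULT_MEMORY vCPU ioctl and struct kvm_pre_fault_memory from the KVM UAPI; the prefault_range() helper, its retry policy, and the bare-bones error handling are illustrative assumptions, not part of this commit.

/*
 * Hypothetical userspace sketch (not part of this commit): issue
 * KVM_PRE_FAULT_MEMORY on a vCPU fd and retry when the ioctl fails with
 * EAGAIN, which now also covers a concurrent delete/move of the backing
 * memslot.  A real VMM would likely coordinate with whichever thread is
 * updating memslots rather than spinning.
 */
#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int prefault_range(int vcpu_fd, __u64 gpa, __u64 size)
{
	struct kvm_pre_fault_memory range;

	memset(&range, 0, sizeof(range));
	range.gpa = gpa;
	range.size = size;

	/* KVM advances range.gpa/range.size as pages are prefaulted. */
	while (range.size) {
		if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0) {
			/* Transient failure, e.g. a memslot update in flight. */
			if (errno == EAGAIN || errno == EINTR)
				continue;
			return -errno;
		}
	}
	return 0;
}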

7 files changed: +190 -80 lines changed


arch/x86/include/asm/kvm_host.h

Lines changed: 26 additions & 13 deletions
@@ -1348,6 +1348,30 @@ enum kvm_apicv_inhibit {
 	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
 	__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
 
+struct kvm_possible_nx_huge_pages {
+	/*
+	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
+	 * replaced by an NX huge page.  A shadow page is on this list if its
+	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+	 * and there are no other conditions that prevent a huge page, e.g.
+	 * the backing host page is huge, dirtly logging is not enabled for its
+	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
+	 * guarantee an NX huge page will be created in its stead, e.g. if the
+	 * guest attempts to execute from the region then KVM obviously can't
+	 * create an NX huge page (without hanging the guest).
+	 */
+	struct list_head pages;
+	u64 nr_pages;
+};
+
+enum kvm_mmu_type {
+	KVM_SHADOW_MMU,
+#ifdef CONFIG_X86_64
+	KVM_TDP_MMU,
+#endif
+	KVM_NR_MMU_TYPES,
+};
+
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
@@ -1360,18 +1384,7 @@ struct kvm_arch {
 	bool pre_fault_allowed;
 	struct hlist_head *mmu_page_hash;
 	struct list_head active_mmu_pages;
-	/*
-	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
-	 * replaced by an NX huge page.  A shadow page is on this list if its
-	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
-	 * and there are no other conditions that prevent a huge page, e.g.
-	 * the backing host page is huge, dirtly logging is not enabled for its
-	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
-	 * guarantee an NX huge page will be created in its stead, e.g. if the
-	 * guest attempts to execute from the region then KVM obviously can't
-	 * create an NX huge page (without hanging the guest).
-	 */
-	struct list_head possible_nx_huge_pages;
+	struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
 #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
 	struct kvm_page_track_notifier_head track_notifier_head;
 #endif
@@ -1526,7 +1539,7 @@ struct kvm_arch {
 	 * is held in read mode:
 	 *  - tdp_mmu_roots (above)
 	 *  - the link field of kvm_mmu_page structs used by the TDP MMU
-	 *  - possible_nx_huge_pages;
+	 *  - possible_nx_huge_pages[KVM_TDP_MMU];
 	 *  - the possible_nx_huge_page_link field of kvm_mmu_page structs used
 	 *    by the TDP MMU
 	 * Because the lock is only taken within the MMU lock, strictly
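For illustration only, here is a toy userspace rendering (not kernel code) of the bookkeeping shape introduced above: one {list, counter} pair per MMU type, indexed by enum kvm_mmu_type. Because KVM_NR_MMU_TYPES is the final enumerator, the array, and any loop bounded by it, automatically shrinks to the shadow MMU alone on builds without CONFIG_X86_64 (and thus without the TDP MMU). The printf harness and the simplified struct are stand-ins for the real kernel types.

/*
 * Toy sketch of per-MMU-type NX huge page bookkeeping: an array of
 * {list, counter} pairs indexed by the MMU type enum.
 */
#include <stdio.h>

#define CONFIG_X86_64 1		/* assumption for this sketch */

enum kvm_mmu_type {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

struct possible_nx_huge_pages {
	unsigned long long nr_pages;	/* stand-in for the list + counter */
};

int main(void)
{
	struct possible_nx_huge_pages nx[KVM_NR_MMU_TYPES] = { { 0 } };
	int i;

	nx[KVM_SHADOW_MMU].nr_pages = 2;
#ifdef CONFIG_X86_64
	nx[KVM_TDP_MMU].nr_pages = 5;
#endif

	/* Each MMU type is recovered independently, as in the worker loop. */
	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
		printf("mmu type %d: %llu possible NX huge pages\n",
		       i, nx[i].nr_pages);
	return 0;
}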

arch/x86/kvm/mmu/mmu.c

Lines changed: 107 additions & 54 deletions
@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
 }
 
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				 enum kvm_mmu_type mmu_type)
 {
 	/*
 	 * If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		return;
 
 	++kvm->stat.nx_lpage_splits;
+	++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_add_tail(&sp->possible_nx_huge_page_link,
-		      &kvm->arch.possible_nx_huge_pages);
+		      &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
 }
 
 static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	sp->nx_huge_page_disallowed = true;
 
 	if (nx_huge_page_possible)
-		track_possible_nx_huge_page(kvm, sp);
+		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,20 +821,22 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				   enum kvm_mmu_type mmu_type)
 {
 	if (list_empty(&sp->possible_nx_huge_page_link))
 		return;
 
 	--kvm->stat.nx_lpage_splits;
+	--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_del_init(&sp->possible_nx_huge_page_link);
 }
 
 static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	sp->nx_huge_page_disallowed = false;
 
-	untrack_possible_nx_huge_page(kvm, sp);
+	untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -4663,10 +4667,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
 	/*
 	 * Retry the page fault if the gfn hit a memslot that is being deleted
 	 * or moved.  This ensures any existing SPTEs for the old memslot will
-	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.  Punt the
+	 * error to userspace if this is a prefault, as KVM's prefaulting ABI
+	 * doesn't provide the same forward progress guarantees as KVM_RUN.
 	 */
-	if (slot->flags & KVM_MEMSLOT_INVALID)
+	if (slot->flags & KVM_MEMSLOT_INVALID) {
+		if (fault->prefetch)
+			return -EAGAIN;
+
 		return RET_PF_RETRY;
+	}
 
 	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
 		/*
@@ -6751,11 +6761,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
 int kvm_mmu_init_vm(struct kvm *kvm)
 {
-	int r;
+	int r, i;
 
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
 	if (tdp_mmu_enabled) {
@@ -7596,19 +7607,64 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 	return err;
 }
 
-static void kvm_recover_nx_huge_pages(struct kvm *kvm)
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+					  enum kvm_mmu_type mmu_type)
+{
+	unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+	unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+					     struct kvm_mmu_page *sp)
 {
-	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
 	struct kvm_memory_slot *slot;
-	int rcu_idx;
+
+	/*
+	 * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+	 * as memslot lookups are relatively expensive.
+	 *
+	 * If a memslot update is in progress, reading an incorrect value of
+	 * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+	 * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+	 * nonzero, the page will be zapped unnecessarily.  Either way, this
+	 * only affects efficiency in racy situations, and not correctness.
+	 */
+	if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+		return false;
+
+	slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+	if (WARN_ON_ONCE(!slot))
+		return false;
+
+	return kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+				      const enum kvm_mmu_type mmu_type)
+{
+#ifdef CONFIG_X86_64
+	const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+	spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
+#else
+	const bool is_tdp_mmu = false;
+	spinlock_t *tdp_mmu_pages_lock = NULL;
+#endif
+	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+	struct list_head *nx_huge_pages;
 	struct kvm_mmu_page *sp;
-	unsigned int ratio;
 	LIST_HEAD(invalid_list);
 	bool flush = false;
-	ulong to_zap;
+	int rcu_idx;
+
+	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
-	write_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
 
 	/*
 	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
@@ -7617,11 +7673,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	 */
 	rcu_read_lock();
 
-	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
-	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
 	for ( ; to_zap; --to_zap) {
-		if (list_empty(&kvm->arch.possible_nx_huge_pages))
+		if (is_tdp_mmu)
+			spin_lock(tdp_mmu_pages_lock);
+
+		if (list_empty(nx_huge_pages)) {
+			if (is_tdp_mmu)
+				spin_unlock(tdp_mmu_pages_lock);
 			break;
+		}
 
 		/*
 		 * We use a separate list instead of just using active_mmu_pages
@@ -7630,64 +7690,55 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 		 * the total number of shadow pages.  And because the TDP MMU
 		 * doesn't use active_mmu_pages.
 		 */
-		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
+		sp = list_first_entry(nx_huge_pages,
 				      struct kvm_mmu_page,
 				      possible_nx_huge_page_link);
 		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
 		WARN_ON_ONCE(!sp->role.direct);
 
+		unaccount_nx_huge_page(kvm, sp);
+
+		if (is_tdp_mmu)
+			spin_unlock(tdp_mmu_pages_lock);
+
 		/*
-		 * Unaccount and do not attempt to recover any NX Huge Pages
-		 * that are being dirty tracked, as they would just be faulted
-		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
-		 * recovered, along with all the other huge pages in the slot,
-		 * when dirty logging is disabled.
-		 *
-		 * Since gfn_to_memslot() is relatively expensive, it helps to
-		 * skip it if it the test cannot possibly return true.  On the
-		 * other hand, if any memslot has logging enabled, chances are
-		 * good that all of them do, in which case unaccount_nx_huge_page()
-		 * is much cheaper than zapping the page.
-		 *
-		 * If a memslot update is in progress, reading an incorrect value
-		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
-		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
-		 * it is becoming nonzero, the page will be zapped unnecessarily.
-		 * Either way, this only affects efficiency in racy situations,
-		 * and not correctness.
+		 * Do not attempt to recover any NX Huge Pages that are being
+		 * dirty tracked, as they would just be faulted back in as 4KiB
+		 * pages.  The NX Huge Pages in this slot will be recovered,
+		 * along with all the other huge pages in the slot, when dirty
+		 * logging is disabled.
 		 */
-		slot = NULL;
-		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
-			struct kvm_memslots *slots;
+		if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+			if (is_tdp_mmu)
+				flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+			else
+				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 
-			slots = kvm_memslots_for_spte_role(kvm, sp->role);
-			slot = __gfn_to_memslot(slots, sp->gfn);
-			WARN_ON_ONCE(!slot);
 		}
 
-		if (slot && kvm_slot_dirty_track_enabled(slot))
-			unaccount_nx_huge_page(kvm, sp);
-		else if (is_tdp_mmu_page(sp))
-			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
-		else
-			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
 
 		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 			rcu_read_unlock();
 
-			cond_resched_rwlock_write(&kvm->mmu_lock);
-			flush = false;
+			if (is_tdp_mmu)
+				cond_resched_rwlock_read(&kvm->mmu_lock);
+			else
+				cond_resched_rwlock_write(&kvm->mmu_lock);
 
+			flush = false;
 			rcu_read_lock();
 		}
 	}
 	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
 	rcu_read_unlock();
 
-	write_unlock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_unlock(&kvm->mmu_lock);
+	else
+		write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
@@ -7698,9 +7749,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
 	struct kvm *kvm = data;
+	long remaining_time;
 	bool enabled;
 	uint period;
-	long remaining_time;
+	int i;
 
 	enabled = calc_nx_huge_pages_recovery_period(&period);
 	if (!enabled)
@@ -7715,7 +7767,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	kvm_recover_nx_huge_pages(kvm);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		kvm_recover_nx_huge_pages(kvm, i);
 	kvm->arch.nx_huge_page_last = get_jiffies_64();
 	return true;
 }
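To make the per-MMU zap budget concrete, here is a small standalone sketch (not kernel code) of the arithmetic nx_huge_pages_to_zap() performs: each recovery period, roughly 1/ratio of the possible NX huge pages tracked for that MMU type are zapped, and a ratio of zero disables recovery entirely. The ratio of 4 used in the example is assumed to match the default nx_huge_pages_recovery_ratio module parameter; the DIV_ROUND_UP macro is redefined locally so the snippet compiles outside the kernel.

/*
 * Standalone sketch of the per-MMU, per-period zap budget used by the NX
 * huge page recovery worker.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned long nx_huge_pages_to_zap(unsigned long nr_pages,
					  unsigned int ratio)
{
	/* ratio == 0 means recovery is disabled: zap nothing. */
	return ratio ? DIV_ROUND_UP(nr_pages, ratio) : 0;
}

int main(void)
{
	/* e.g. 1000 tracked pages with ratio 4 -> zap 250 per period */
	printf("%lu\n", nx_huge_pages_to_zap(1000, 4));
	/* ratio 0 -> recovery disabled */
	printf("%lu\n", nx_huge_pages_to_zap(1000, 0));
	return 0;
}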

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 4 additions & 2 deletions
@@ -416,7 +416,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
 
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				 enum kvm_mmu_type mmu_type);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				   enum kvm_mmu_type mmu_type);
 
 #endif /* __KVM_X86_MMU_INTERNAL_H */
