Commit 1aa9b95

Junaid Shahid authored and KAGA-KOKO committed
kvm: x86: mmu: Recovery of shattered NX large pages
The page table pages corresponding to broken down large pages are zapped in FIFO order, so that the large page can potentially be recovered, if it is no longer being used for execution. This removes the performance penalty for walking deeper EPT page tables.

By default, one large page will last about one hour once the guest reaches a steady state.

Signed-off-by: Junaid Shahid <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
1 parent c57c804 commit 1aa9b95
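Before reading the diff, a condensed user-space sketch of the recovery scheme may help: shadow pages created by NX huge-page splitting sit on a FIFO list, and a periodic pass zaps up to 1/ratio of them, oldest first, so the original large mapping can be rebuilt on the next fault. This is only an illustration; the struct and helper names are placeholders, and the real implementation is kvm_recover_nx_lpages() in arch/x86/kvm/mmu.c below.

/*
 * Stand-alone illustration of the recovery pass (placeholder types and
 * helpers, not KVM code).
 */
#include <stdio.h>
#include <stdlib.h>

struct shadow_page {
        unsigned long gfn;
        struct shadow_page *next;       /* singly linked FIFO, head = oldest */
};

static struct shadow_page *fifo_head, *fifo_tail;
static unsigned long nr_split_pages;

/* Roughly what account_huge_nx_page() does: remember the split, in order. */
static void track_split(unsigned long gfn)
{
        struct shadow_page *sp = malloc(sizeof(*sp));

        if (!sp)
                return;
        sp->gfn = gfn;
        sp->next = NULL;
        if (fifo_tail)
                fifo_tail->next = sp;
        else
                fifo_head = sp;
        fifo_tail = sp;
        nr_split_pages++;
}

/* Roughly what zapping the shadow page does: forget the oldest split. */
static void zap_oldest(void)
{
        struct shadow_page *sp = fifo_head;

        fifo_head = sp->next;
        if (!fifo_head)
                fifo_tail = NULL;
        nr_split_pages--;
        free(sp);
}

/* One recovery pass, run once a minute by the worker thread in the patch. */
static void recovery_pass(unsigned int ratio)
{
        unsigned long to_zap;

        if (!ratio)
                return;                 /* ratio == 0 disables recovery */
        to_zap = (nr_split_pages + ratio - 1) / ratio;  /* DIV_ROUND_UP */
        while (to_zap-- && fifo_head)
                zap_oldest();
}

int main(void)
{
        unsigned long gfn;

        for (gfn = 0; gfn < 1200; gfn++)
                track_split(gfn);

        /*
         * Default ratio 60: one pass zaps ceil(1200 / 60) = 20 pages, so the
         * whole backlog turns over in about 60 one-minute passes, which is
         * where the "about one hour" estimate in the commit message comes from.
         */
        recovery_pass(60);
        printf("%lu split pages remain after one pass\n", nr_split_pages);
        return 0;
}

Zapping in FIFO order gives every split page roughly the same lifetime, which is what makes the one-hour steady-state estimate possible.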

6 files changed: 182 additions & 0 deletions

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 6 additions & 0 deletions
@@ -2068,6 +2068,12 @@
                         If the software workaround is enabled for the host,
                         guests do need not to enable it for nested guests.
 
+        kvm.nx_huge_pages_recovery_ratio=
+                        [KVM] Controls how many 4KiB pages are periodically zapped
+                        back to huge pages. 0 disables the recovery, otherwise if
+                        the value is N KVM will zap 1/Nth of the 4KiB pages every
+                        minute. The default is 60.
+
         kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
                         Default is 1 (enabled)
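A brief usage note, as an illustration rather than part of the patch: the knob is registered as a writable (0644) parameter of the kvm module in arch/x86/kvm/mmu.c, so besides the boot-time form documented above it should also be adjustable at run time through /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio. For example, a hypothetical boot-time setting that recycles split pages twice as fast as the default would be:

        kvm.nx_huge_pages_recovery_ratio=30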

arch/x86/include/asm/kvm_host.h

Lines changed: 4 additions & 0 deletions
@@ -312,6 +312,8 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
+       struct list_head lpage_disallowed_link;
+
        bool unsync;
        u8 mmu_valid_gen;
        bool mmio_cached;

@@ -860,6 +862,7 @@ struct kvm_arch {
         */
        struct list_head active_mmu_pages;
        struct list_head zapped_obsolete_pages;
+       struct list_head lpage_disallowed_mmu_pages;
        struct kvm_page_track_notifier_node mmu_sp_tracker;
        struct kvm_page_track_notifier_head track_notifier_head;

@@ -934,6 +937,7 @@ struct kvm_arch {
        bool exception_payload_enabled;
 
        struct kvm_pmu_event_filter *pmu_event_filter;
+       struct task_struct *nx_lpage_recovery_thread;
 };
 
 struct kvm_vm_stat {

arch/x86/kvm/mmu.c

Lines changed: 129 additions & 0 deletions
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/kern_levels.h>
+#include <linux/kthread.h>
 
 #include <asm/page.h>
 #include <asm/pat.h>

@@ -50,16 +51,26 @@
 extern bool itlb_multihit_kvm_mitigation;
 
 static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
 
 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
 
 static struct kernel_param_ops nx_huge_pages_ops = {
        .set = set_nx_huge_pages,
        .get = param_get_bool,
 };
 
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+       .set = set_nx_huge_pages_recovery_ratio,
+       .get = param_get_uint,
+};
+
 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+               &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging

@@ -1215,6 +1226,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
                return;
 
        ++kvm->stat.nx_lpage_splits;
+       list_add_tail(&sp->lpage_disallowed_link,
+                     &kvm->arch.lpage_disallowed_mmu_pages);
        sp->lpage_disallowed = true;
 }
 

@@ -1239,6 +1252,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        --kvm->stat.nx_lpage_splits;
        sp->lpage_disallowed = false;
+       list_del(&sp->lpage_disallowed_link);
 }
 
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,

@@ -6274,6 +6288,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
                        idx = srcu_read_lock(&kvm->srcu);
                        kvm_mmu_zap_all_fast(kvm);
                        srcu_read_unlock(&kvm->srcu, idx);
+
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
                }
                mutex_unlock(&kvm_lock);
        }

@@ -6367,3 +6383,116 @@ void kvm_mmu_module_exit(void)
        unregister_shrinker(&mmu_shrinker);
        mmu_audit_disable();
 }
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+       unsigned int old_val;
+       int err;
+
+       old_val = nx_huge_pages_recovery_ratio;
+       err = param_set_uint(val, kp);
+       if (err)
+               return err;
+
+       if (READ_ONCE(nx_huge_pages) &&
+           !old_val && nx_huge_pages_recovery_ratio) {
+               struct kvm *kvm;
+
+               mutex_lock(&kvm_lock);
+
+               list_for_each_entry(kvm, &vm_list, vm_list)
+                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+               mutex_unlock(&kvm_lock);
+       }
+
+       return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+       int rcu_idx;
+       struct kvm_mmu_page *sp;
+       unsigned int ratio;
+       LIST_HEAD(invalid_list);
+       ulong to_zap;
+
+       rcu_idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+
+       ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+       to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+       while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+               /*
+                * We use a separate list instead of just using active_mmu_pages
+                * because the number of lpage_disallowed pages is expected to
+                * be relatively small compared to the total.
+                */
+               sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+                                     struct kvm_mmu_page,
+                                     lpage_disallowed_link);
+               WARN_ON_ONCE(!sp->lpage_disallowed);
+               kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+               WARN_ON_ONCE(sp->lpage_disallowed);
+
+               if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       if (to_zap)
+                               cond_resched_lock(&kvm->mmu_lock);
+               }
+       }
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+       return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+               ? start_time + 60 * HZ - get_jiffies_64()
+               : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+       u64 start_time;
+       long remaining_time;
+
+       while (true) {
+               start_time = get_jiffies_64();
+               remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               while (!kthread_should_stop() && remaining_time > 0) {
+                       schedule_timeout(remaining_time);
+                       remaining_time = get_nx_lpage_recovery_timeout(start_time);
+                       set_current_state(TASK_INTERRUPTIBLE);
+               }
+
+               set_current_state(TASK_RUNNING);
+
+               if (kthread_should_stop())
+                       return 0;
+
+               kvm_recover_nx_lpages(kvm);
+       }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+       int err;
+
+       err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+                                         "kvm-nx-lpage-recovery",
+                                         &kvm->arch.nx_lpage_recovery_thread);
+       if (!err)
+               kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+       return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+       if (kvm->arch.nx_lpage_recovery_thread)
+               kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
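As an aside for readers less familiar with the kthread pattern above, the user-space sketch below models the worker's lifecycle with pthreads: a dedicated thread sleeps for a fixed interval, wakes early when asked to stop, and otherwise runs one recovery pass per period. It is an analogy only; recovery_worker() and recovery_pass() are assumed names, and the interval is shortened to 2 seconds in place of the one minute used by kvm_nx_lpage_recovery_worker().

/* User-space analogy of the per-VM recovery thread; not kernel code. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool should_stop;

static void recovery_pass(void)
{
        /* Stands in for kvm_recover_nx_lpages(). */
        printf("recovery pass\n");
}

static void *recovery_worker(void *unused)
{
        struct timespec deadline;

        (void)unused;
        pthread_mutex_lock(&lock);
        while (!should_stop) {
                /* The patch wakes up once a minute; 2 s here for the demo. */
                clock_gettime(CLOCK_REALTIME, &deadline);
                deadline.tv_sec += 2;

                /* Sleep until the deadline unless told to stop early. */
                while (!should_stop &&
                       pthread_cond_timedwait(&wake, &lock, &deadline) != ETIMEDOUT)
                        ;
                if (should_stop)
                        break;

                pthread_mutex_unlock(&lock);
                recovery_pass();
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        /* ~ kvm_mmu_post_init_vm(): start the worker when the VM is created. */
        pthread_create(&tid, NULL, recovery_worker, NULL);

        sleep(5);       /* let a couple of recovery passes run */

        /* ~ kvm_mmu_pre_destroy_vm(): wake the worker and wait for it to exit. */
        pthread_mutex_lock(&lock);
        should_stop = true;
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
}

The real code gets the same early-wakeup behaviour from wake_up_process() in set_nx_huge_pages() and set_nx_huge_pages_recovery_ratio(), and a clean shutdown from kthread_stop() in kvm_mmu_pre_destroy_vm().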

arch/x86/kvm/mmu.h

Lines changed: 4 additions & 0 deletions
@@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn);
 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
 #endif

arch/x86/kvm/x86.c

Lines changed: 11 additions & 0 deletions
@@ -9456,6 +9456,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+       INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 

@@ -9484,6 +9485,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        return kvm_x86_ops->vm_init(kvm);
 }
 
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return kvm_mmu_post_init_vm(kvm);
+}
+
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
        vcpu_load(vcpu);

@@ -9585,6 +9591,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 }
 EXPORT_SYMBOL_GPL(x86_set_memory_region);
 
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+       kvm_mmu_pre_destroy_vm(kvm);
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        if (current->mm == kvm->mm) {

virt/kvm/kvm_main.c

Lines changed: 28 additions & 0 deletions
@@ -626,6 +626,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
        return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        struct kvm *kvm = kvm_arch_alloc_vm();

@@ -682,6 +699,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
                goto out_err_no_irq_srcu;
 
        r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;
 

@@ -694,6 +715,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
        return kvm;
 
 out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
        cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);

@@ -738,6 +764,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
