Skip to content

Commit 55f50b2

Browse files
committed
Merge branch 'kvm-memslot-zap-quirk' into HEAD
Today whenever a memslot is moved or deleted, KVM invalidates the entire page tables and generates fresh ones based on the new memslot layout. This behavior traditionally was kept because of a bug which was never fully investigated and caused VM instability with assigned GeForce GPUs. It generally does not have a huge overhead, because the old MMU is able to reuse cached page tables and the new one is more scalable and can resolve EPT violations/nested page faults in parallel, but it has worse performance if the guest frequently deletes and adds small memslots, and it's entirely not viable for TDX. This is because TDX requires re-accepting of private pages after page dropping. For non-TDX VMs, this series therefore introduces the KVM_X86_QUIRK_SLOT_ZAP_ALL quirk, enabling users to control the behavior of memslot zapping when a memslot is moved/deleted. The quirk is turned on by default, leading to the zapping of all SPTEs when a memslot is moved/deleted; users however have the option to turn off the quirk, which limits the zapping only to those SPTEs that lie within the range of the memslot being moved/deleted. Signed-off-by: Paolo Bonzini <[email protected]>
2 parents 356dab4 + 61de4c3 commit 55f50b2

File tree

7 files changed

+101
-13
lines changed

7 files changed

+101
-13
lines changed

Documentation/virt/kvm/api.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8082,6 +8082,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
80828082
guest CPUID on writes to MISC_ENABLE if
80838083
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
80848084
disabled.
8085+
8086+
KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
8087+
a fast way for memslot deletion when the VM type
8088+
is KVM_X86_DEFAULT_VM.
8089+
When this quirk is disabled or when VM type
8090+
is other than KVM_X86_DEFAULT_VM, KVM zaps
8091+
only leaf SPTEs that are within the range of
8092+
the memslot being deleted.
80858093
=================================== ============================================
80868094

80878095
7.32 KVM_CAP_MAX_VCPU_ID

arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
23452345
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
23462346
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
23472347
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
2348-
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
2348+
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
2349+
KVM_X86_QUIRK_SLOT_ZAP_ALL)
23492350

23502351
/*
23512352
* KVM previously used a u32 field in kvm_run to indicate the hypercall was

arch/x86/include/uapi/asm/kvm.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ struct kvm_sync_regs {
439439
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
440440
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
441441
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
442+
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
442443

443444
#define KVM_STATE_NESTED_FORMAT_VMX 0
444445
#define KVM_STATE_NESTED_FORMAT_SVM 1

arch/x86/kvm/mmu/mmu.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6999,10 +6999,50 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
69996999
kvm_mmu_zap_all(kvm);
70007000
}
70017001

7002+
/*
7003+
* Zap only the leaf SPTEs within the memslot's GFN range when a memslot is moved/deleted.
7004+
*
7005+
* Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
7006+
* case scenario we'll have unused shadow pages lying around until they
7007+
* are recycled due to age or when the VM is destroyed.
7008+
*/
7009+
static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
7010+
{
7011+
struct kvm_gfn_range range = {
7012+
.slot = slot,
7013+
.start = slot->base_gfn,
7014+
.end = slot->base_gfn + slot->npages,
7015+
.may_block = true,
7016+
};
7017+
bool flush = false;
7018+
7019+
write_lock(&kvm->mmu_lock);
7020+
7021+
if (kvm_memslots_have_rmaps(kvm))
7022+
flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap);
7023+
7024+
if (tdp_mmu_enabled)
7025+
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);
7026+
7027+
if (flush)
7028+
kvm_flush_remote_tlbs_memslot(kvm, slot);
7029+
7030+
write_unlock(&kvm->mmu_lock);
7031+
}
7032+
7033+
static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
7034+
{
7035+
return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
7036+
kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
7037+
}
7038+
70027039
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
70037040
struct kvm_memory_slot *slot)
70047041
{
7005-
kvm_mmu_zap_all_fast(kvm);
7042+
if (kvm_memslot_flush_zap_all(kvm))
7043+
kvm_mmu_zap_all_fast(kvm);
7044+
else
7045+
kvm_mmu_zap_memslot_leafs(kvm, slot);
70067046
}
70077047

70087048
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)

tools/testing/selftests/kvm/memslot_modification_stress_test.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ struct test_params {
7979
useconds_t delay;
8080
uint64_t nr_iterations;
8181
bool partition_vcpu_memory_access;
82+
bool disable_slot_zap_quirk;
8283
};
8384

8485
static void run_test(enum vm_guest_mode mode, void *arg)
@@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
8990
vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
9091
VM_MEM_SRC_ANONYMOUS,
9192
p->partition_vcpu_memory_access);
93+
#ifdef __x86_64__
94+
if (p->disable_slot_zap_quirk)
95+
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
96+
97+
pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ?
98+
"disabled" : "enabled");
99+
#endif
92100

93101
pr_info("Finished creating vCPUs\n");
94102

@@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
107115
static void help(char *name)
108116
{
109117
puts("");
110-
printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
118+
printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n"
111119
" [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
112120
guest_modes_help();
113121
printf(" -d: add a delay between each iteration of adding and\n"
114122
" deleting a memslot in usec.\n");
123+
printf(" -q: Disable memslot zap quirk.\n");
115124
printf(" -b: specify the size of the memory region which should be\n"
116125
" accessed by each vCPU. e.g. 10M or 3G.\n"
117126
" Default: 1G\n");
@@ -137,7 +146,7 @@ int main(int argc, char *argv[])
137146

138147
guest_modes_append_default();
139148

140-
while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
149+
while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) {
141150
switch (opt) {
142151
case 'm':
143152
guest_modes_cmdline(optarg);
@@ -160,6 +169,12 @@ int main(int argc, char *argv[])
160169
case 'i':
161170
p.nr_iterations = atoi_positive("Number of iterations", optarg);
162171
break;
172+
case 'q':
173+
p.disable_slot_zap_quirk = true;
174+
175+
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
176+
KVM_X86_QUIRK_SLOT_ZAP_ALL);
177+
break;
163178
case 'h':
164179
default:
165180
help(argv[0]);

tools/testing/selftests/kvm/memslot_perf_test.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
113113
static sem_t vcpu_ready;
114114

115115
static bool map_unmap_verify;
116+
static bool disable_slot_zap_quirk;
116117

117118
static bool verbose;
118119
#define pr_info_v(...) \
@@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data,
578579
uint32_t guest_page_size = data->vm->page_size;
579580
uint64_t movesrcgpa, movetestgpa;
580581

582+
if (disable_slot_zap_quirk)
583+
vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
584+
581585
movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
582586

583587
if (isactive) {
@@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs)
896900
pr_info(" -h: print this help screen.\n");
897901
pr_info(" -v: enable verbose mode (not for benchmarking).\n");
898902
pr_info(" -d: enable extra debug checks.\n");
903+
pr_info(" -q: Disable memslot zap quirk during memslot move.\n");
899904
pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
900905
targs->nslots);
901906
pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
@@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[],
954959
uint32_t max_mem_slots;
955960
int opt;
956961

957-
while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
962+
while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) {
958963
switch (opt) {
959964
case 'h':
960965
default:
@@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[],
966971
case 'd':
967972
map_unmap_verify = true;
968973
break;
974+
case 'q':
975+
disable_slot_zap_quirk = true;
976+
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
977+
KVM_X86_QUIRK_SLOT_ZAP_ALL);
978+
break;
969979
case 's':
970980
targs->nslots = atoi_paranoid(optarg);
971981
if (targs->nslots <= 1 && targs->nslots != -1) {

tools/testing/selftests/kvm/set_memory_region_test.c

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void)
175175
GUEST_DONE();
176176
}
177177

178-
static void test_move_memory_region(void)
178+
static void test_move_memory_region(bool disable_slot_zap_quirk)
179179
{
180180
pthread_t vcpu_thread;
181181
struct kvm_vcpu *vcpu;
@@ -184,6 +184,9 @@ static void test_move_memory_region(void)
184184

185185
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region);
186186

187+
if (disable_slot_zap_quirk)
188+
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
189+
187190
hva = addr_gpa2hva(vm, MEM_REGION_GPA);
188191

189192
/*
@@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void)
266269
GUEST_ASSERT(0);
267270
}
268271

269-
static void test_delete_memory_region(void)
272+
static void test_delete_memory_region(bool disable_slot_zap_quirk)
270273
{
271274
pthread_t vcpu_thread;
272275
struct kvm_vcpu *vcpu;
@@ -276,6 +279,9 @@ static void test_delete_memory_region(void)
276279

277280
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region);
278281

282+
if (disable_slot_zap_quirk)
283+
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
284+
279285
/* Delete the memory region, the guest should not die. */
280286
vm_mem_region_delete(vm, MEM_REGION_SLOT);
281287
wait_for_vcpu();
@@ -553,7 +559,10 @@ int main(int argc, char *argv[])
553559
{
554560
#ifdef __x86_64__
555561
int i, loops;
562+
int j, disable_slot_zap_quirk = 0;
556563

564+
if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL)
565+
disable_slot_zap_quirk = 1;
557566
/*
558567
* FIXME: the zero-memslot test fails on aarch64 and s390x because
559568
* KVM_RUN fails with ENOEXEC or EFAULT.
@@ -579,13 +588,17 @@ int main(int argc, char *argv[])
579588
else
580589
loops = 10;
581590

582-
pr_info("Testing MOVE of in-use region, %d loops\n", loops);
583-
for (i = 0; i < loops; i++)
584-
test_move_memory_region();
591+
for (j = 0; j <= disable_slot_zap_quirk; j++) {
592+
pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n",
593+
loops, j ? "disabled" : "enabled");
594+
for (i = 0; i < loops; i++)
595+
test_move_memory_region(!!j);
585596

586-
pr_info("Testing DELETE of in-use region, %d loops\n", loops);
587-
for (i = 0; i < loops; i++)
588-
test_delete_memory_region();
597+
pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n",
598+
loops, j ? "disabled" : "enabled");
599+
for (i = 0; i < loops; i++)
600+
test_delete_memory_region(!!j);
601+
}
589602
#endif
590603

591604
return 0;

0 commit comments

Comments
 (0)