Skip to content

Commit f3996d4

Browse files
committed
Merge branch 'kvm-prefault' into HEAD
Pre-population has been requested several times to mitigate KVM page faults during guest boot or after live migration. It is also required by TDX before filling in the initial guest memory with measured contents. Introduce it as a generic API.
2 parents eb162c9 + 9ff0e37 commit f3996d4

File tree

12 files changed

+394
-26
lines changed

12 files changed

+394
-26
lines changed

Documentation/virt/kvm/api.rst

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6352,6 +6352,61 @@ a single guest_memfd file, but the bound ranges must not overlap).
63526352

63536353
See KVM_SET_USER_MEMORY_REGION2 for additional details.
63546354

6355+
4.143 KVM_PRE_FAULT_MEMORY
6356+
--------------------------
6357+
6358+
:Capability: KVM_CAP_PRE_FAULT_MEMORY
6359+
:Architectures: none
6360+
:Type: vcpu ioctl
6361+
:Parameters: struct kvm_pre_fault_memory (in/out)
6362+
:Returns: 0 if at least one page is processed, < 0 on error
6363+
6364+
Errors:
6365+
6366+
========== ===============================================================
6367+
EINVAL The specified `gpa` and `size` were invalid (e.g. not
6368+
page aligned, causes an overflow, or size is zero).
6369+
ENOENT The specified `gpa` is outside defined memslots.
6370+
EINTR An unmasked signal is pending and no page was processed.
6371+
EFAULT The parameter address was invalid.
6372+
EOPNOTSUPP Mapping memory for a GPA is unsupported by the
6373+
hypervisor, and/or for the current vCPU state/mode.
6374+
EIO unexpected error conditions (also causes a WARN)
6375+
========== ===============================================================
6376+
6377+
::
6378+
6379+
struct kvm_pre_fault_memory {
6380+
/* in/out */
6381+
__u64 gpa;
6382+
__u64 size;
6383+
/* in */
6384+
__u64 flags;
6385+
__u64 padding[5];
6386+
};
6387+
6388+
KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
6389+
for the current vCPU state. KVM maps memory as if the vCPU generated a
6390+
stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
6391+
CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
6392+
6393+
In some cases, multiple vCPUs might share the page tables. In this
6394+
case, the ioctl can be called in parallel.
6395+
6396+
When the ioctl returns, the input values are updated to point to the
6397+
remaining range. If `size` > 0 on return, the caller can just issue
6398+
the ioctl again with the same `struct kvm_pre_fault_memory` argument.
6399+
6400+
Shadow page tables cannot support this ioctl because they
6401+
are indexed by virtual address or nested guest physical address.
6402+
Calling this ioctl when the guest is using shadow page tables (for
6403+
example because it is running a nested guest with nested page tables)
6404+
will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
6405+
the capability to be present.
6406+
6407+
`flags` must currently be zero.
6408+
6409+
63556410
5. The kvm_run structure
63566411
========================
63576412

arch/x86/kvm/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ config KVM
4444
select KVM_VFIO
4545
select HAVE_KVM_PM_NOTIFIER if PM
4646
select KVM_GENERIC_HARDWARE_ENABLING
47+
select KVM_GENERIC_PRE_FAULT_MEMORY
4748
select KVM_WERROR if WERROR
4849
help
4950
Support hosting fully virtualized guest machines using hardware

arch/x86/kvm/mmu/mmu.c

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4291,7 +4291,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
42914291
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
42924292
return;
42934293

4294-
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
4294+
r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
4295+
true, NULL, NULL);
4296+
4297+
/*
4298+
* Account fixed page faults, otherwise they'll never be counted, but
4299+
* ignore stats for all other return types. Page-ready "faults" aren't
4300+
* truly spurious and never trigger emulation.
4301+
*/
4302+
if (r == RET_PF_FIXED)
4303+
vcpu->stat.pf_fixed++;
42954304
}
42964305

42974306
static inline u8 kvm_max_level_for_order(int order)
@@ -4700,6 +4709,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
47004709
return direct_page_fault(vcpu, fault);
47014710
}
47024711

4712+
static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
4713+
u8 *level)
4714+
{
4715+
int r;
4716+
4717+
/*
4718+
* Restrict to TDP page fault, since that's the only case where the MMU
4719+
* is indexed by GPA.
4720+
*/
4721+
if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
4722+
return -EOPNOTSUPP;
4723+
4724+
do {
4725+
if (signal_pending(current))
4726+
return -EINTR;
4727+
cond_resched();
4728+
r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
4729+
} while (r == RET_PF_RETRY);
4730+
4731+
if (r < 0)
4732+
return r;
4733+
4734+
switch (r) {
4735+
case RET_PF_FIXED:
4736+
case RET_PF_SPURIOUS:
4737+
return 0;
4738+
4739+
case RET_PF_EMULATE:
4740+
return -ENOENT;
4741+
4742+
case RET_PF_RETRY:
4743+
case RET_PF_CONTINUE:
4744+
case RET_PF_INVALID:
4745+
default:
4746+
WARN_ONCE(1, "could not fix page fault during prefault");
4747+
return -EIO;
4748+
}
4749+
}
4750+
4751+
/*
 * x86 implementation of the KVM_PRE_FAULT_MEMORY vcpu ioctl: pre-populate
 * the stage-2 mapping that covers range->gpa.
 *
 * Returns the number of bytes mapped at the start of the range (capped at
 * range->size; the installed mapping may be a huge page that extends past
 * the requested range), or a negative errno from kvm_tdp_map_page().
 */
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
4752+
struct kvm_pre_fault_memory *range)
4753+
{
4754+
u64 error_code = PFERR_GUEST_FINAL_MASK;
4755+
u8 level = PG_LEVEL_4K;
4756+
u64 end;
4757+
int r;
4758+
4759+
/*
4760+
* reload is efficient when called repeatedly, so we can do it on
4761+
* every iteration.
4762+
*/
4763+
kvm_mmu_reload(vcpu);
4764+
4765+
/* Fault in private memory as private when the gfn's attributes say so. */
if (kvm_arch_has_private_mem(vcpu->kvm) &&
4766+
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
4767+
error_code |= PFERR_PRIVATE_ACCESS;
4768+
4769+
/*
4770+
* Shadow paging uses GVA for kvm page fault, so restrict to
4771+
* two-dimensional paging.
4772+
*/
4773+
r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
4774+
if (r < 0)
4775+
return r;
4776+
4777+
/*
4778+
* If the mapping that covers range->gpa can use a huge page, it
4779+
* may start below it or end after range->gpa + range->size.
4780+
*/
4781+
end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
4782+
return min(range->size, end - range->gpa);
4783+
}
4784+
47034785
static void nonpaging_init_context(struct kvm_mmu *context)
47044786
{
47054787
context->page_fault = nonpaging_page_fault;
@@ -5925,14 +6007,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
59256007
}
59266008

59276009
if (r == RET_PF_INVALID) {
6010+
vcpu->stat.pf_taken++;
6011+
59286012
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
5929-
&emulation_type);
6013+
&emulation_type, NULL);
59306014
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
59316015
return -EIO;
59326016
}
59336017

59346018
if (r < 0)
59356019
return r;
6020+
6021+
if (r == RET_PF_FIXED)
6022+
vcpu->stat.pf_fixed++;
6023+
else if (r == RET_PF_EMULATE)
6024+
vcpu->stat.pf_emulate++;
6025+
else if (r == RET_PF_SPURIOUS)
6026+
vcpu->stat.pf_spurious++;
6027+
59366028
if (r != RET_PF_EMULATE)
59376029
return 1;
59386030

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
288288
}
289289

290290
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
291-
u64 err, bool prefetch, int *emulation_type)
291+
u64 err, bool prefetch,
292+
int *emulation_type, u8 *level)
292293
{
293294
struct kvm_page_fault fault = {
294295
.addr = cr2_or_gpa,
@@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
318319
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
319320
}
320321

321-
/*
322-
* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
323-
* guest perspective and have already been counted at the time of the
324-
* original fault.
325-
*/
326-
if (!prefetch)
327-
vcpu->stat.pf_taken++;
328-
329322
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
330323
r = kvm_tdp_page_fault(vcpu, &fault);
331324
else
@@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
344337

345338
if (fault.write_fault_to_shadow_pgtable && emulation_type)
346339
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
340+
if (level)
341+
*level = fault.goal_level;
347342

348-
/*
349-
* Similar to above, prefetch faults aren't truly spurious, and the
350-
* async #PF path doesn't do emulation. Do count faults that are fixed
351-
* by the async #PF handler though, otherwise they'll never be counted.
352-
*/
353-
if (r == RET_PF_FIXED)
354-
vcpu->stat.pf_fixed++;
355-
else if (prefetch)
356-
;
357-
else if (r == RET_PF_EMULATE)
358-
vcpu->stat.pf_emulate++;
359-
else if (r == RET_PF_SPURIOUS)
360-
vcpu->stat.pf_spurious++;
361343
return r;
362344
}
363345

arch/x86/kvm/x86.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4705,6 +4705,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
47054705
case KVM_CAP_MEMORY_FAULT_INFO:
47064706
r = 1;
47074707
break;
4708+
case KVM_CAP_PRE_FAULT_MEMORY:
4709+
r = tdp_enabled;
4710+
break;
47084711
case KVM_CAP_EXIT_HYPERCALL:
47094712
r = KVM_EXIT_HYPERCALL_VALID_MASK;
47104713
break;

include/linux/kvm_host.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2477,4 +2477,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
24772477
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
24782478
#endif
24792479

2480+
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
2481+
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
2482+
struct kvm_pre_fault_memory *range);
2483+
#endif
2484+
24802485
#endif

include/uapi/linux/kvm.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,7 @@ struct kvm_enable_cap {
917917
#define KVM_CAP_MEMORY_ATTRIBUTES 233
918918
#define KVM_CAP_GUEST_MEMFD 234
919919
#define KVM_CAP_VM_TYPES 235
920+
#define KVM_CAP_PRE_FAULT_MEMORY 236
920921

921922
struct kvm_irq_routing_irqchip {
922923
__u32 irqchip;
@@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
15481549
__u64 reserved[6];
15491550
};
15501551

1552+
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
1553+
1554+
/*
 * Argument for the KVM_PRE_FAULT_MEMORY vcpu ioctl.  `gpa` and `size` are
 * in/out: on return they are advanced to describe the range still left to
 * process.  `flags` must currently be zero.
 */
struct kvm_pre_fault_memory {
1555+
/* in/out: start of the guest physical range to pre-fault */
__u64 gpa;
1556+
/* in/out: number of bytes remaining to pre-fault */
__u64 size;
1557+
/* in: must be zero */
__u64 flags;
1558+
__u64 padding[5];
1559+
};
1560+
15511561
#endif /* __LINUX_KVM_H */

tools/include/uapi/linux/kvm.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,7 @@ struct kvm_enable_cap {
917917
#define KVM_CAP_MEMORY_ATTRIBUTES 233
918918
#define KVM_CAP_GUEST_MEMFD 234
919919
#define KVM_CAP_VM_TYPES 235
920+
#define KVM_CAP_PRE_FAULT_MEMORY 236
920921

921922
struct kvm_irq_routing_irqchip {
922923
__u32 irqchip;
@@ -1221,9 +1222,9 @@ struct kvm_vfio_spapr_tce {
12211222
/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
12221223
#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
12231224
#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
1224-
/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */
1225+
/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
12251226
#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg)
1226-
/* Available with KVM_CAP_PPC_RADIX_MMU */
1227+
/* Available with KVM_CAP_PPC_MMU_RADIX */
12271228
#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info)
12281229
/* Available with KVM_CAP_PPC_GET_CPU_CHAR */
12291230
#define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char)
@@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd {
15481549
__u64 reserved[6];
15491550
};
15501551

1552+
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
1553+
1554+
/*
 * Argument for the KVM_PRE_FAULT_MEMORY vcpu ioctl (tools/ copy of the
 * uapi header).  `gpa` and `size` are in/out and are updated on return to
 * the remaining unprocessed range; `flags` must currently be zero.
 */
struct kvm_pre_fault_memory {
1555+
/* in/out: start of the guest physical range to pre-fault */
__u64 gpa;
1556+
/* in/out: number of bytes remaining to pre-fault */
__u64 size;
1557+
/* in: must be zero */
__u64 flags;
1558+
__u64 padding[5];
1559+
};
1560+
15511561
#endif /* __LINUX_KVM_H */

tools/testing/selftests/kvm/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
145145
TEST_GEN_PROGS_x86_64 += steal_time
146146
TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
147147
TEST_GEN_PROGS_x86_64 += system_counter_offset_test
148+
TEST_GEN_PROGS_x86_64 += pre_fault_memory_test
148149

149150
# Compiled outputs used by test targets
150151
TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test

0 commit comments

Comments
 (0)