Skip to content

Commit f365084

Browse files
committed
Merge branch 'kvm-coco-pagefault-prep' into HEAD
A combination of prep work for TDX and SNP, and a clean up of the page fault path to (hopefully) make it easier to follow the rules for private memory, noslot faults, writes to read-only slots, etc.
2 parents 1e21b53 + 2b1f435 commit f365084

File tree

6 files changed

+174
-98
lines changed

6 files changed

+174
-98
lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -254,28 +254,31 @@ enum x86_intercept_stage;
254254
KVM_GUESTDBG_INJECT_DB | \
255255
KVM_GUESTDBG_BLOCKIRQ)
256256

257+
#define PFERR_PRESENT_MASK BIT(0)
258+
#define PFERR_WRITE_MASK BIT(1)
259+
#define PFERR_USER_MASK BIT(2)
260+
#define PFERR_RSVD_MASK BIT(3)
261+
#define PFERR_FETCH_MASK BIT(4)
262+
#define PFERR_PK_MASK BIT(5)
263+
#define PFERR_SGX_MASK BIT(15)
264+
#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
265+
#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
266+
#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
267+
#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
268+
#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
269+
#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
257270

258-
#define PFERR_PRESENT_BIT 0
259-
#define PFERR_WRITE_BIT 1
260-
#define PFERR_USER_BIT 2
261-
#define PFERR_RSVD_BIT 3
262-
#define PFERR_FETCH_BIT 4
263-
#define PFERR_PK_BIT 5
264-
#define PFERR_SGX_BIT 15
265-
#define PFERR_GUEST_FINAL_BIT 32
266-
#define PFERR_GUEST_PAGE_BIT 33
267-
#define PFERR_IMPLICIT_ACCESS_BIT 48
268-
269-
#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
270-
#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
271-
#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
272-
#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
273-
#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
274-
#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
275-
#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
276-
#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
277-
#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
278-
#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
271+
/*
272+
* IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
273+
* when emulating instructions that trigger implicit access.
274+
*/
275+
#define PFERR_IMPLICIT_ACCESS BIT_ULL(48)
276+
/*
277+
* PRIVATE_ACCESS is a KVM-defined flag used to indicate that a fault occurred
278+
* when the guest was accessing private memory.
279+
*/
280+
#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
281+
#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
279282

280283
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
281284
PFERR_WRITE_MASK | \
@@ -1848,6 +1851,7 @@ struct kvm_arch_async_pf {
18481851
gfn_t gfn;
18491852
unsigned long cr3;
18501853
bool direct_map;
1854+
u64 error_code;
18511855
};
18521856

18531857
extern u32 __read_mostly kvm_nr_uret_msrs;

arch/x86/kvm/mmu.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
213213
*/
214214
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
215215
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
216-
int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
216+
int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
217217
u32 errcode = PFERR_PRESENT_MASK;
218218
bool fault;
219219

@@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
234234
pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
235235

236236
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
237-
offset = (pfec & ~1) +
238-
((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
237+
offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
239238

240239
pkru_bits &= mmu->pkru_mask >> offset;
241240
errcode |= -pkru_bits & PFERR_PK_MASK;

arch/x86/kvm/mmu/mmu.c

Lines changed: 112 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -3262,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
32623262
{
32633263
gva_t gva = fault->is_tdp ? 0 : fault->addr;
32643264

3265+
if (fault->is_private) {
3266+
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
3267+
return -EFAULT;
3268+
}
3269+
32653270
vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
32663271
access & shadow_mmio_access_mask);
32673272

3273+
fault->slot = NULL;
3274+
fault->pfn = KVM_PFN_NOSLOT;
3275+
fault->map_writable = false;
3276+
fault->hva = KVM_HVA_ERR_BAD;
3277+
32683278
/*
32693279
* If MMIO caching is disabled, emulate immediately without
32703280
* touching the shadow page tables as attempting to install an
@@ -4207,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
42074217
return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
42084218
}
42094219

4210-
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4211-
gfn_t gfn)
4220+
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
4221+
struct kvm_page_fault *fault)
42124222
{
42134223
struct kvm_arch_async_pf arch;
42144224

42154225
arch.token = alloc_apf_token(vcpu);
4216-
arch.gfn = gfn;
4226+
arch.gfn = fault->gfn;
4227+
arch.error_code = fault->error_code;
42174228
arch.direct_map = vcpu->arch.mmu->root_role.direct;
42184229
arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
42194230

4220-
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4221-
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4231+
return kvm_setup_async_pf(vcpu, fault->addr,
4232+
kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
42224233
}
42234234

42244235
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
42254236
{
42264237
int r;
42274238

4239+
if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
4240+
return;
4241+
42284242
if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
42294243
work->wakeup_all)
42304244
return;
@@ -4237,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
42374251
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
42384252
return;
42394253

4240-
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
4254+
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
42414255
}
42424256

42434257
static inline u8 kvm_max_level_for_order(int order)
@@ -4257,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order)
42574271
return PG_LEVEL_4K;
42584272
}
42594273

4260-
static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
4261-
struct kvm_page_fault *fault)
4262-
{
4263-
kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
4264-
PAGE_SIZE, fault->write, fault->exec,
4265-
fault->is_private);
4266-
}
4267-
42684274
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
42694275
struct kvm_page_fault *fault)
42704276
{
@@ -4291,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
42914297

42924298
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
42934299
{
4294-
struct kvm_memory_slot *slot = fault->slot;
42954300
bool async;
42964301

4297-
/*
4298-
* Retry the page fault if the gfn hit a memslot that is being deleted
4299-
* or moved. This ensures any existing SPTEs for the old memslot will
4300-
* be zapped before KVM inserts a new MMIO SPTE for the gfn.
4301-
*/
4302-
if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4303-
return RET_PF_RETRY;
4304-
4305-
if (!kvm_is_visible_memslot(slot)) {
4306-
/* Don't expose private memslots to L2. */
4307-
if (is_guest_mode(vcpu)) {
4308-
fault->slot = NULL;
4309-
fault->pfn = KVM_PFN_NOSLOT;
4310-
fault->map_writable = false;
4311-
return RET_PF_CONTINUE;
4312-
}
4313-
/*
4314-
* If the APIC access page exists but is disabled, go directly
4315-
* to emulation without caching the MMIO access or creating a
4316-
* MMIO SPTE. That way the cache doesn't need to be purged
4317-
* when the AVIC is re-enabled.
4318-
*/
4319-
if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4320-
!kvm_apicv_activated(vcpu->kvm))
4321-
return RET_PF_EMULATE;
4322-
}
4323-
4324-
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
4325-
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4326-
return -EFAULT;
4327-
}
4328-
43294302
if (fault->is_private)
43304303
return kvm_faultin_pfn_private(vcpu, fault);
43314304

43324305
async = false;
4333-
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
4334-
fault->write, &fault->map_writable,
4335-
&fault->hva);
4306+
fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
4307+
&async, fault->write,
4308+
&fault->map_writable, &fault->hva);
43364309
if (!async)
43374310
return RET_PF_CONTINUE; /* *pfn has correct page already */
43384311

@@ -4342,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
43424315
trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
43434316
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
43444317
return RET_PF_RETRY;
4345-
} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4318+
} else if (kvm_arch_setup_async_pf(vcpu, fault)) {
43464319
return RET_PF_RETRY;
43474320
}
43484321
}
@@ -4352,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
43524325
* to wait for IO. Note, gup always bails if it is unable to quickly
43534326
* get a page and a fatal signal, i.e. SIGKILL, is pending.
43544327
*/
4355-
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
4356-
fault->write, &fault->map_writable,
4357-
&fault->hva);
4328+
fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
4329+
NULL, fault->write,
4330+
&fault->map_writable, &fault->hva);
43584331
return RET_PF_CONTINUE;
43594332
}
43604333

43614334
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
43624335
unsigned int access)
43634336
{
4337+
struct kvm_memory_slot *slot = fault->slot;
43644338
int ret;
43654339

4340+
/*
4341+
* Note that the mmu_invalidate_seq also serves to detect a concurrent
4342+
* change in attributes. is_page_fault_stale() will detect an
4343+
* invalidation related to fault->gfn and resume the guest without
4344+
* installing a mapping in the page tables.
4345+
*/
4346+
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4347+
smp_rmb();
4348+
4349+
/*
4350+
* Now that we have a snapshot of mmu_invalidate_seq we can check for a
4351+
* private vs. shared mismatch.
4352+
*/
4353+
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
4354+
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4355+
return -EFAULT;
4356+
}
4357+
4358+
if (unlikely(!slot))
4359+
return kvm_handle_noslot_fault(vcpu, fault, access);
4360+
4361+
/*
4362+
* Retry the page fault if the gfn hit a memslot that is being deleted
4363+
* or moved. This ensures any existing SPTEs for the old memslot will
4364+
* be zapped before KVM inserts a new MMIO SPTE for the gfn.
4365+
*/
4366+
if (slot->flags & KVM_MEMSLOT_INVALID)
4367+
return RET_PF_RETRY;
4368+
4369+
if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
4370+
/*
4371+
* Don't map L1's APIC access page into L2, KVM doesn't support
4372+
* using APICv/AVIC to accelerate L2 accesses to L1's APIC,
4373+
* i.e. the access needs to be emulated. Emulating access to
4374+
* L1's APIC is also correct if L1 is accelerating L2's own
4375+
* virtual APIC, but for some reason L1 also maps _L1's_ APIC
4376+
* into L2. Note, vcpu_is_mmio_gpa() always treats access to
4377+
* the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
4378+
* uses different roots for L1 vs. L2, i.e. there is no danger
4379+
* of breaking APICv/AVIC for L1.
4380+
*/
4381+
if (is_guest_mode(vcpu))
4382+
return kvm_handle_noslot_fault(vcpu, fault, access);
4383+
4384+
/*
4385+
* If the APIC access page exists but is disabled, go directly
4386+
* to emulation without caching the MMIO access or creating a
4387+
* MMIO SPTE. That way the cache doesn't need to be purged
4388+
* when the AVIC is re-enabled.
4389+
*/
4390+
if (!kvm_apicv_activated(vcpu->kvm))
4391+
return RET_PF_EMULATE;
4392+
}
4393+
43664394
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
43674395
smp_rmb();
43684396

@@ -4387,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
43874415
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
43884416
* to detect retry guarantees the worst case latency for the vCPU.
43894417
*/
4390-
if (fault->slot &&
4391-
mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
4418+
if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
43924419
return RET_PF_RETRY;
43934420

43944421
ret = __kvm_faultin_pfn(vcpu, fault);
@@ -4398,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
43984425
if (unlikely(is_error_pfn(fault->pfn)))
43994426
return kvm_handle_error_pfn(vcpu, fault);
44004427

4401-
if (unlikely(!fault->slot))
4428+
if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
44024429
return kvm_handle_noslot_fault(vcpu, fault, access);
44034430

44044431
/*
@@ -4509,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
45094536
if (WARN_ON_ONCE(fault_address >> 32))
45104537
return -EFAULT;
45114538
#endif
4539+
/*
4540+
* Legacy #PF exception only have a 32-bit error code. Simply drop the
4541+
* upper bits as KVM doesn't use them for #PF (because they are never
4542+
* set), and to ensure there are no collisions with KVM-defined bits.
4543+
*/
4544+
if (WARN_ON_ONCE(error_code >> 32))
4545+
error_code = lower_32_bits(error_code);
4546+
4547+
/* Ensure the above sanity check also covers KVM-defined flags. */
4548+
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
45124549

45134550
vcpu->arch.l1tf_flush_l1d = true;
45144551
if (!flags) {
@@ -5794,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
57945831
int r, emulation_type = EMULTYPE_PF;
57955832
bool direct = vcpu->arch.mmu->root_role.direct;
57965833

5797-
/*
5798-
* IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
5799-
* checks when emulating instructions that triggers implicit access.
5800-
* WARN if hardware generates a fault with an error code that collides
5801-
* with the KVM-defined value. Clear the flag and continue on, i.e.
5802-
* don't terminate the VM, as KVM can't possibly be relying on a flag
5803-
* that KVM doesn't know about.
5804-
*/
5805-
if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
5806-
error_code &= ~PFERR_IMPLICIT_ACCESS;
5807-
58085834
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
58095835
return RET_PF_RETRY;
58105836

5837+
/*
5838+
* Except for reserved faults (emulated MMIO is shared-only), set the
5839+
* PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
5840+
* current attributes, which are the source of truth for such VMs. Note,
5841+
* this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
5842+
* currently support nested virtualization (among many other things)
5843+
* for software-protected VMs.
5844+
*/
5845+
if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
5846+
!(error_code & PFERR_RSVD_MASK) &&
5847+
vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
5848+
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
5849+
error_code |= PFERR_PRIVATE_ACCESS;
5850+
58115851
r = RET_PF_INVALID;
58125852
if (unlikely(error_code & PFERR_RSVD_MASK)) {
5853+
if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
5854+
return -EFAULT;
5855+
58135856
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
58145857
if (r == RET_PF_EMULATE)
58155858
goto emulate;
58165859
}
58175860

58185861
if (r == RET_PF_INVALID) {
5819-
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5820-
lower_32_bits(error_code), false,
5862+
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
58215863
&emulation_type);
58225864
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
58235865
return -EIO;

0 commit comments

Comments
 (0)