
Commit bce58da

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "x86:

   - Account for family 17h event renumberings in AMD PMU emulation

   - Remove CPUID leaf 0xA on AMD processors

   - Fix lockdep issue with locking all vCPUs

   - Fix loss of A/D bits in SPTEs

   - Fix syzkaller issue with invalid guest state"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: VMX: Exit to userspace if vCPU has injected exception and invalid state
  KVM: SEV: Mark nested locking of vcpu->lock
  kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural PMU
  KVM: x86/svm: Account for family 17h event renumberings in amd_pmc_perf_hw_id
  KVM: x86/mmu: Use atomic XCHG to write TDP MMU SPTEs with volatile bits
  KVM: x86/mmu: Move shadow-present check out of spte_has_volatile_bits()
  KVM: x86/mmu: Don't treat fully writable SPTEs as volatile (modulo A/D)
2 parents: 497fe3b + 053d229

9 files changed: 190 additions, 69 deletions


arch/x86/kvm/cpuid.c

Lines changed: 5 additions & 0 deletions
@@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                 union cpuid10_eax eax;
                 union cpuid10_edx edx;
 
+                if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                        break;
+                }
+
                 perf_get_x86_pmu_capability(&cap);
 
                 /*

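The hunk above makes KVM return an all-zero CPUID leaf 0xA when the host has no architectural PMU (i.e. AMD hosts), instead of advertising a phantom Intel-style PMU to the guest. A minimal userspace model of that gating follows; cpuid_entry, fill_leaf_0xa and host_has_arch_perfmon are illustrative stand-ins, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

struct cpuid_entry { unsigned int eax, ebx, ecx, edx; };

/* Models the new gate in the leaf 0xA handling of __do_cpuid_func(). */
static void fill_leaf_0xa(struct cpuid_entry *entry, bool host_has_arch_perfmon)
{
        if (!host_has_arch_perfmon) {
                /* No architectural PMU on the host: advertise none to the guest. */
                entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                return;
        }
        /* Otherwise the leaf would be populated from perf's reported limits. */
}

int main(void)
{
        struct cpuid_entry e = { 0xff, 0xff, 0xff, 0xff };

        fill_leaf_0xa(&e, false);
        printf("CPUID.0xA.EAX = %#x (zero => no architectural PMU)\n", e.eax);
        return 0;
}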
arch/x86/kvm/mmu/mmu.c

Lines changed: 5 additions & 29 deletions
@@ -473,30 +473,6 @@ static u64 __get_spte_lockless(u64 *sptep)
 }
 #endif
 
-static bool spte_has_volatile_bits(u64 spte)
-{
-        if (!is_shadow_present_pte(spte))
-                return false;
-
-        /*
-         * Always atomically update spte if it can be updated
-         * out of mmu-lock, it can ensure dirty bit is not lost,
-         * also, it can help us to get a stable is_writable_pte()
-         * to ensure tlb flush is not missed.
-         */
-        if (spte_can_locklessly_be_made_writable(spte) ||
-            is_access_track_spte(spte))
-                return true;
-
-        if (spte_ad_enabled(spte)) {
-                if ((spte & shadow_accessed_mask) == 0 ||
-                    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
-                        return true;
-        }
-
-        return false;
-}
-
 /* Rules for using mmu_spte_set:
  * Set the sptep from nonpresent to present.
  * Note: the sptep being assigned *must* be either not present
@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
          * we always atomically update it, see the comments in
          * spte_has_volatile_bits().
          */
-        if (spte_can_locklessly_be_made_writable(old_spte) &&
+        if (is_mmu_writable_spte(old_spte) &&
             !is_writable_pte(new_spte))
                 flush = true;
 
@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
         u64 old_spte = *sptep;
         int level = sptep_to_sp(sptep)->role.level;
 
-        if (!spte_has_volatile_bits(old_spte))
+        if (!is_shadow_present_pte(old_spte) ||
+            !spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, 0ull);
         else
                 old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
         u64 spte = *sptep;
 
         if (!is_writable_pte(spte) &&
-            !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+            !(pt_protect && is_mmu_writable_spte(spte)))
                 return false;
 
         rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                  * be removed in the fast path only if the SPTE was
                  * write-protected for dirty-logging or access tracking.
                  */
-                if (fault->write &&
-                    spte_can_locklessly_be_made_writable(spte)) {
+                if (fault->write && is_mmu_writable_spte(spte)) {
                         new_spte |= PT_WRITABLE_MASK;
 
                         /*

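The mmu.c changes are mostly mechanical: spte_can_locklessly_be_made_writable() becomes is_mmu_writable_spte(), and the shadow-present check moves out of spte_has_volatile_bits() into its caller. The distinction the rename preserves is between the hardware Writable bit and the software "MMU-writable" bit; the toy model below uses illustrative bit positions, not KVM's actual masks.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_HW_WRITABLE   (1ull << 1)   /* hardware Writable bit (illustrative) */
#define SPTE_MMU_WRITABLE  (1ull << 58)  /* software MMU-writable bit (illustrative) */

static bool is_writable_pte(uint64_t spte)      { return spte & SPTE_HW_WRITABLE; }
static bool is_mmu_writable_spte(uint64_t spte) { return spte & SPTE_MMU_WRITABLE; }

int main(void)
{
        /* Write-protected for dirty logging: MMU-writable, but not HW-writable. */
        uint64_t spte = SPTE_MMU_WRITABLE;

        printf("hw-writable=%d mmu-writable=%d\n",
               is_writable_pte(spte), is_mmu_writable_spte(spte));
        /* Only the fast page fault path may restore the HW bit without mmu_lock. */
        return 0;
}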
arch/x86/kvm/mmu/spte.c

Lines changed: 28 additions & 0 deletions
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
                              E820_TYPE_RAM);
 }
 
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+        /*
+         * Always atomically update spte if it can be updated
+         * out of mmu-lock, it can ensure dirty bit is not lost,
+         * also, it can help us to get a stable is_writable_pte()
+         * to ensure tlb flush is not missed.
+         */
+        if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+                return true;
+
+        if (is_access_track_spte(spte))
+                return true;
+
+        if (spte_ad_enabled(spte)) {
+                if (!(spte & shadow_accessed_mask) ||
+                    (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+                        return true;
+        }
+
+        return false;
+}
+
 bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,

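spte_has_volatile_bits() now assumes its caller has already verified the SPTE is shadow-present, and it no longer treats a fully writable SPTE as volatile (only the A/D bits remain a concern there). The sketch below models that decision with simplified masks and helpers; it is not the kernel code, and it omits the access-tracking and A/D-disabled cases for brevity.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT       (1ull << 0)
#define SPTE_HW_WRITABLE   (1ull << 1)
#define SPTE_ACCESSED      (1ull << 5)
#define SPTE_DIRTY         (1ull << 6)
#define SPTE_MMU_WRITABLE  (1ull << 58)

static bool is_shadow_present_pte(uint64_t s) { return s & SPTE_PRESENT; }
static bool is_writable_pte(uint64_t s)       { return s & SPTE_HW_WRITABLE; }
static bool is_mmu_writable_spte(uint64_t s)  { return s & SPTE_MMU_WRITABLE; }

static bool spte_has_volatile_bits(uint64_t spte)
{
        /* The Writable bit can be set locklessly only while it is still clear. */
        if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
                return true;

        /* Accessed/Dirty bits can be set by the CPU at any time. */
        if (!(spte & SPTE_ACCESSED) ||
            (is_writable_pte(spte) && !(spte & SPTE_DIRTY)))
                return true;

        return false;
}

int main(void)
{
        uint64_t spte = SPTE_PRESENT | SPTE_MMU_WRITABLE | SPTE_ACCESSED;

        /* Mirrors mmu_spte_clear_track_bits(): the present check is now external. */
        if (is_shadow_present_pte(spte) && spte_has_volatile_bits(spte))
                printf("clear the SPTE with an atomic exchange\n");
        else
                printf("a plain write is safe\n");
        return 0;
}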
arch/x86/kvm/mmu/spte.h

Lines changed: 3 additions & 1 deletion
@@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
                   "kvm: Writable SPTE is not MMU-writable: %llx", spte);
 }
 
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
 {
         return spte & shadow_mmu_writable_mask;
 }
@@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
         return gen;
 }
 
+bool spte_has_volatile_bits(u64 spte);
+
 bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                const struct kvm_memory_slot *slot,
                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,

arch/x86/kvm/mmu/tdp_iter.h

Lines changed: 32 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include <linux/kvm_host.h>
 
 #include "mmu.h"
+#include "spte.h"
 
 /*
  * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
 {
         return READ_ONCE(*rcu_dereference(sptep));
 }
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+        return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+        WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+                                         u64 new_spte, int level)
 {
-        WRITE_ONCE(*rcu_dereference(sptep), val);
+        /*
+         * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+         * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+         * The Writable bit can be set by KVM's fast page fault handler, and
+         * Accessed and Dirty bits can be set by the CPU.
+         *
+         * Note, non-leaf SPTEs do have Accessed bits and those bits are
+         * technically volatile, but KVM doesn't consume the Accessed bit of
+         * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+         * logic needs to be reassessed if KVM were to use non-leaf Accessed
+         * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+         */
+        if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+            spte_has_volatile_bits(old_spte))
+                return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+        __kvm_tdp_mmu_write_spte(sptep, new_spte);
+        return old_spte;
 }
 
 /*

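The new kvm_tdp_mmu_write_spte() only falls back to an atomic exchange when the old SPTE is a shadow-present leaf with volatile bits; otherwise the plain WRITE_ONCE() path is kept. The tiny userspace demo below (C11 atomics, illustrative bit values, not kernel code) shows why the exchange matters: it returns the value that was actually replaced, so an Accessed/Dirty bit set by the CPU after KVM's last read is reported rather than silently lost.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_DIRTY    (1ull << 6)    /* illustrative bit position */
#define REMOVED_SPTE  (1ull << 59)   /* illustrative marker value */

int main(void)
{
        _Atomic uint64_t spte = 0x1234;
        uint64_t stale = atomic_load(&spte);      /* KVM's earlier snapshot */

        /* The CPU's page walker sets the Dirty bit behind KVM's back... */
        atomic_fetch_or(&spte, SPTE_DIRTY);

        /* ...an xchg-style write still hands the latest value back. */
        uint64_t old = atomic_exchange(&spte, REMOVED_SPTE);

        printf("stale snapshot dirty=%d, xchg returned dirty=%d\n",
               !!(stale & SPTE_DIRTY), !!(old & SPTE_DIRTY));
        /* A plain WRITE_ONCE() based on the stale snapshot would drop the bit. */
        return 0;
}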
arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 53 additions & 29 deletions
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
         tdp_mmu_unlink_sp(kvm, sp, shared);
 
         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-                u64 *sptep = rcu_dereference(pt) + i;
+                tdp_ptep_t sptep = pt + i;
                 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
-                u64 old_child_spte;
+                u64 old_spte;
 
                 if (shared) {
                         /*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * value to the removed SPTE value.
                          */
                         for (;;) {
-                                old_child_spte = xchg(sptep, REMOVED_SPTE);
-                                if (!is_removed_spte(old_child_spte))
+                                old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+                                if (!is_removed_spte(old_spte))
                                         break;
                                 cpu_relax();
                         }
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                          * are guarded by the memslots generation, not by being
                          * unreachable.
                          */
-                        old_child_spte = READ_ONCE(*sptep);
-                        if (!is_shadow_present_pte(old_child_spte))
+                        old_spte = kvm_tdp_mmu_read_spte(sptep);
+                        if (!is_shadow_present_pte(old_spte))
                                 continue;
 
                         /*
-                         * Marking the SPTE as a removed SPTE is not
-                         * strictly necessary here as the MMU lock will
-                         * stop other threads from concurrently modifying
-                         * this SPTE. Using the removed SPTE value keeps
-                         * the two branches consistent and simplifies
-                         * the function.
+                         * Use the common helper instead of a raw WRITE_ONCE as
+                         * the SPTE needs to be updated atomically if it can be
+                         * modified by a different vCPU outside of mmu_lock.
+                         * Even though the parent SPTE is !PRESENT, the TLB
+                         * hasn't yet been flushed, and both Intel and AMD
+                         * document that A/D assists can use upper-level PxE
+                         * entries that are cached in the TLB, i.e. the CPU can
+                         * still access the page and mark it dirty.
+                         *
+                         * No retry is needed in the atomic update path as the
+                         * sole concern is dropping a Dirty bit, i.e. no other
+                         * task can zap/remove the SPTE as mmu_lock is held for
+                         * write. Marking the SPTE as a removed SPTE is not
+                         * strictly necessary for the same reason, but using
+                         * the removed SPTE value keeps the shared/exclusive
+                         * paths consistent and allows the handle_changed_spte()
+                         * call below to hardcode the new value to REMOVED_SPTE.
+                         *
+                         * Note, even though dropping a Dirty bit is the only
+                         * scenario where a non-atomic update could result in a
+                         * functional bug, simply checking the Dirty bit isn't
+                         * sufficient as a fast page fault could read the upper
+                         * level SPTE before it is zapped, and then make this
+                         * target SPTE writable, resume the guest, and set the
+                         * Dirty bit between reading the SPTE above and writing
+                         * it here.
                          */
-                        WRITE_ONCE(*sptep, REMOVED_SPTE);
+                        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+                                                          REMOVED_SPTE, level);
                 }
                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-                                    old_child_spte, REMOVED_SPTE, level,
-                                    shared);
+                                    old_spte, REMOVED_SPTE, level, shared);
         }
 
         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                            KVM_PAGES_PER_HPAGE(iter->level));
 
         /*
-         * No other thread can overwrite the removed SPTE as they
-         * must either wait on the MMU lock or use
-         * tdp_mmu_set_spte_atomic which will not overwrite the
-         * special removed SPTE value. No bookkeeping is needed
-         * here since the SPTE is going from non-present
-         * to non-present.
+         * No other thread can overwrite the removed SPTE as they must either
+         * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+         * overwrite the special removed SPTE value. No bookkeeping is needed
+         * here since the SPTE is going from non-present to non-present. Use
+         * the raw write helper to avoid an unnecessary check on volatile bits.
          */
-        kvm_tdp_mmu_write_spte(iter->sptep, 0);
+        __kvm_tdp_mmu_write_spte(iter->sptep, 0);
 
         return 0;
 }
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
  * unless performing certain dirty logging operations.
  * Leaving record_dirty_log unset in that case prevents page
  * writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
  */
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-                               u64 old_spte, u64 new_spte, gfn_t gfn, int level,
-                               bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+                              u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+                              bool record_acc_track, bool record_dirty_log)
 {
         lockdep_assert_held_write(&kvm->mmu_lock);
 
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
          */
         WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
 
-        kvm_tdp_mmu_write_spte(sptep, new_spte);
+        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
 
         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
 
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
         if (record_dirty_log)
                 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                               new_spte, level);
+        return old_spte;
 }
 
 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 {
         WARN_ON_ONCE(iter->yielded);
 
-        __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
-                           new_spte, iter->gfn, iter->level,
-                           record_acc_track, record_dirty_log);
+        iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+                                            iter->old_spte, new_spte,
+                                            iter->gfn, iter->level,
+                                            record_acc_track, record_dirty_log);
 }
 
 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,

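__tdp_mmu_set_spte() now returns the SPTE value it actually replaced, and _tdp_mmu_set_spte() writes that back into iter->old_spte so the later access-track and dirty-log bookkeeping sees any bits the hardware set in the meantime. A simplified model of that plumbing, with stand-in types and names rather than KVM's, is sketched below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_DIRTY  (1ull << 6)   /* illustrative */

struct iter_model {
        _Atomic uint64_t *sptep;
        uint64_t old_spte;        /* cached snapshot, possibly stale */
};

/* Mirrors kvm_tdp_mmu_write_spte(): atomic only when the SPTE is volatile. */
static uint64_t write_spte(_Atomic uint64_t *sptep, uint64_t old_spte,
                           uint64_t new_spte, bool has_volatile_bits)
{
        if (has_volatile_bits)
                return atomic_exchange(sptep, new_spte);

        atomic_store(sptep, new_spte);
        return old_spte;
}

int main(void)
{
        _Atomic uint64_t spte = SPTE_DIRTY;       /* CPU dirtied the page */
        struct iter_model iter = { &spte, 0 };    /* snapshot taken earlier */

        /* The caller refreshes its snapshot from the returned value, as
         * _tdp_mmu_set_spte() now does with __tdp_mmu_set_spte(). */
        iter.old_spte = write_spte(iter.sptep, iter.old_spte, 0, true);
        printf("old_spte seen by bookkeeping: %#llx\n",
               (unsigned long long)iter.old_spte);
        return 0;
}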
arch/x86/kvm/svm/pmu.c

Lines changed: 25 additions & 3 deletions
@@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
         [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
 };
 
+/* duplicated from amd_f17h_perfmon_event_map. */
+static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
+        [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+        [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+        [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
+        [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
+        [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+        [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+        [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+        [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+/* amd_pmc_perf_hw_id depends on these being the same size */
+static_assert(ARRAY_SIZE(amd_event_mapping) ==
+              ARRAY_SIZE(amd_f17h_event_mapping));
+
 static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
 {
         struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
 
 static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
 {
+        struct kvm_event_hw_type_mapping *event_mapping;
         u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
         u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
         int i;
@@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
         if (WARN_ON(pmc_is_fixed(pmc)))
                 return PERF_COUNT_HW_MAX;
 
+        if (guest_cpuid_family(pmc->vcpu) >= 0x17)
+                event_mapping = amd_f17h_event_mapping;
+        else
+                event_mapping = amd_event_mapping;
+
         for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
-                if (amd_event_mapping[i].eventsel == event_select
-                    && amd_event_mapping[i].unit_mask == unit_mask)
+                if (event_mapping[i].eventsel == event_select
+                    && event_mapping[i].unit_mask == unit_mask)
                         break;
 
         if (i == ARRAY_SIZE(amd_event_mapping))
                 return PERF_COUNT_HW_MAX;
 
-        return amd_event_mapping[i].event_type;
+        return event_mapping[i].event_type;
 }
 
 /* check if a PMC is enabled by comparing it against global_ctrl bits. Because

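Family 17h (Zen) renumbered several of the events KVM maps to generic perf IDs, so amd_pmc_perf_hw_id() now picks the lookup table by guest CPUID family before matching on (event_select, unit_mask). A standalone model of that selection follows; the two stalled-cycles-backend encodings are taken from the tables in the diff, the rest of the scaffolding is illustrative.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct event_map { uint8_t eventsel; uint8_t unit_mask; const char *name; };

/* Pre-Zen encoding of stalled-cycles-backend (from amd_event_mapping). */
static const struct event_map amd_legacy[] = {
        { 0x76, 0x00, "cpu-cycles" },
        { 0xd1, 0x00, "stalled-cycles-backend" },
};

/* Zen (family >= 0x17) encoding (from amd_f17h_event_mapping). */
static const struct event_map amd_f17h[] = {
        { 0x76, 0x00, "cpu-cycles" },
        { 0x87, 0x01, "stalled-cycles-backend" },
};

static const char *perf_hw_id(unsigned int family, uint8_t evt, uint8_t umask)
{
        const struct event_map *map = family >= 0x17 ? amd_f17h : amd_legacy;
        size_t n = family >= 0x17 ? sizeof(amd_f17h) / sizeof(amd_f17h[0])
                                  : sizeof(amd_legacy) / sizeof(amd_legacy[0]);

        for (size_t i = 0; i < n; i++)
                if (map[i].eventsel == evt && map[i].unit_mask == umask)
                        return map[i].name;
        return "unmapped";
}

int main(void)
{
        /* The same guest-programmed event selector resolves per family. */
        printf("family 0x17, 0x87/0x01 -> %s\n", perf_hw_id(0x17, 0x87, 0x01));
        printf("family 0x15, 0x87/0x01 -> %s\n", perf_hw_id(0x15, 0x87, 0x01));
        printf("family 0x15, 0xd1/0x00 -> %s\n", perf_hw_id(0x15, 0xd1, 0x00));
        return 0;
}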