Commit 8c5bd25

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "Fix unwinding of KVM_CREATE_VM failure, VT-d posted interrupts,
  DAX/ZONE_DEVICE, and module unload/reload"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved
  KVM: VMX: Introduce pi_is_pir_empty() helper
  KVM: VMX: Do not change PID.NDST when loading a blocked vCPU
  KVM: VMX: Consider PID.PIR to determine if vCPU has pending interrupts
  KVM: VMX: Fix comment to specify PID.ON instead of PIR.ON
  KVM: X86: Fix initialization of MSR lists
  KVM: fix placement of refcount initialization
  KVM: Fix NULL-ptr deref after kvm_create_vm fails
2 parents eb094f0 + a78986a commit 8c5bd25

6 files changed, 96 insertions(+), 51 deletions(-)


arch/x86/kvm/mmu.c

Lines changed: 4 additions & 4 deletions
@@ -3393,7 +3393,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
          * here.
          */
         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-            level == PT_PAGE_TABLE_LEVEL &&
+            !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
             PageTransCompoundMap(pfn_to_page(pfn)) &&
             !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                 unsigned long mask;
@@ -6009,9 +6009,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                  * the guest, and the guest page table is using 4K page size
                  * mapping if the indirect sp has level = 1.
                  */
-                if (sp->role.direct &&
-                    !kvm_is_reserved_pfn(pfn) &&
-                    PageTransCompoundMap(pfn_to_page(pfn))) {
+                if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+                    !kvm_is_zone_device_pfn(pfn) &&
+                    PageTransCompoundMap(pfn_to_page(pfn))) {
                         pte_list_remove(rmap_head, sptep);
 
                         if (kvm_available_flush_tlb_with_range())
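Both hunks enforce the same rule: a pfn may only be considered for transparent-hugepage adjustment or collapse if it refers to a regular, refcounted page, and ZONE_DEVICE (DAX) pfns no longer count as "reserved" after this series. The standalone sketch below restates that rule with made-up names (the enum, helper, and page kinds are illustrative only; the real code open-codes is_error_noslot_pfn(), kvm_is_reserved_pfn(), kvm_is_zone_device_pfn(), and PageTransCompoundMap()).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative page categories, not kernel types. */
enum page_kind { PAGE_NORMAL, PAGE_RESERVED, PAGE_ZONE_DEVICE, PAGE_ERROR };

/* A pfn qualifies for huge-page handling only if it is a normal page that
 * is actually part of a transparent huge page. */
static bool thp_adjust_allowed(enum page_kind kind, bool trans_compound)
{
        if (kind == PAGE_ERROR || kind == PAGE_RESERVED || kind == PAGE_ZONE_DEVICE)
                return false;           /* the ZONE_DEVICE exclusion is what this fix adds */
        return trans_compound;
}

int main(void)
{
        printf("%d\n", thp_adjust_allowed(PAGE_NORMAL, true));      /* 1 */
        printf("%d\n", thp_adjust_allowed(PAGE_ZONE_DEVICE, true)); /* 0: DAX pages stay 4K */
        return 0;
}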

arch/x86/kvm/vmx/vmx.c

Lines changed: 20 additions & 3 deletions
@@ -1268,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
                 return;
 
+        /*
+         * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+         * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+         * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+         * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+         * correctly.
+         */
+        if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+                pi_clear_sn(pi_desc);
+                goto after_clear_sn;
+        }
+
         /* The full case. */
         do {
                 old.control = new.control = pi_desc->control;
@@ -1283,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
         } while (cmpxchg64(&pi_desc->control, old.control,
                            new.control) != old.control);
 
+after_clear_sn:
+
         /*
          * Clear SN before reading the bitmap. The VT-d firmware
          * writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1291,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
          */
         smp_mb__after_atomic();
 
-        if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+        if (!pi_is_pir_empty(pi_desc))
                 pi_set_on(pi_desc);
 }
 
@@ -6137,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
         if (pi_test_on(&vmx->pi_desc)) {
                 pi_clear_on(&vmx->pi_desc);
                 /*
-                 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+                 * IOMMU can write to PID.ON, so the barrier matters even on UP.
                  * But on x86 this is just a compiler barrier anyway.
                  */
                 smp_mb__after_atomic();
@@ -6167,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 
 static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-        return pi_test_on(vcpu_to_pi_desc(vcpu));
+        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+        return pi_test_on(pi_desc) ||
+                (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
 }
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)

arch/x86/kvm/vmx/vmx.h

Lines changed: 11 additions & 0 deletions
@@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
         return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 }
 
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+        return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
 static inline void pi_set_sn(struct pi_desc *pi_desc)
 {
         set_bit(POSTED_INTR_SN,
@@ -373,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
                   (unsigned long *)&pi_desc->control);
 }
 
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+        clear_bit(POSTED_INTR_SN,
+                  (unsigned long *)&pi_desc->control);
+}
+
 static inline int pi_test_on(struct pi_desc *pi_desc)
 {
         return test_bit(POSTED_INTR_ON,
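The two helpers added here (pi_is_pir_empty() and pi_clear_sn()) back the vmx.c changes above: pi_is_pir_empty() checks whether any bit is set in the 256-entry PIR bitmap, and the new vmx_dy_apicv_has_pending_interrupt() treats a vCPU as having a pending interrupt either when ON is set or when notifications are suppressed (SN set) but the PIR is non-empty. The userspace sketch below mirrors that predicate with a mock descriptor; struct fake_pi_desc and the fake_* functions are illustrative stand-ins, not the kernel's types or APIs.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_VECTORS 256
#define PIR_WORDS  (NR_VECTORS / 64)

/* Illustrative stand-in for the posted-interrupt descriptor. */
struct fake_pi_desc {
        uint64_t pir[PIR_WORDS]; /* one bit per interrupt vector */
        bool on;                 /* outstanding notification */
        bool sn;                 /* suppress notification (vCPU not running) */
};

/* Mirrors the idea of pi_is_pir_empty(): no vector bit set in any word. */
static bool fake_pir_empty(const struct fake_pi_desc *pi)
{
        for (int i = 0; i < PIR_WORDS; i++)
                if (pi->pir[i])
                        return false;
        return true;
}

/* Mirrors the new pending-interrupt check: pending if ON is set, or if
 * notifications are suppressed but something already sits in the PIR. */
static bool fake_has_pending_interrupt(const struct fake_pi_desc *pi)
{
        return pi->on || (pi->sn && !fake_pir_empty(pi));
}

int main(void)
{
        struct fake_pi_desc pi = { .sn = true };

        printf("%d\n", fake_has_pending_interrupt(&pi));   /* 0: PIR empty */
        pi.pir[0x20 / 64] |= 1ull << (0x20 % 64);           /* post vector 0x20 */
        printf("%d\n", fake_has_pending_interrupt(&pi));    /* 1: PIR non-empty */
        return 0;
}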

arch/x86/kvm/x86.c

Lines changed: 26 additions & 30 deletions
@@ -1133,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
- * This list is modified at module load time to reflect the
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
+ * extract the supported MSRs from the related const lists.
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
  * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
  * may depend on host virtualization features rather than host cpu features.
  */
 
-static u32 msrs_to_save[] = {
+static const u32 msrs_to_save_all[] = {
         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
         MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1180,9 +1182,10 @@ static u32 msrs_to_save[] = {
         MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
 };
 
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
 static unsigned num_msrs_to_save;
 
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs_all[] = {
         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1221,7 +1224,7 @@ static u32 emulated_msrs[] = {
          * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
          * We always support the "true" VMX control MSRs, even if the host
          * processor does not, so I am putting these registers here rather
-         * than in msrs_to_save.
+         * than in msrs_to_save_all.
          */
         MSR_IA32_VMX_BASIC,
         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1240,13 +1243,14 @@ static u32 emulated_msrs[] = {
         MSR_KVM_POLL_CONTROL,
 };
 
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 static unsigned num_emulated_msrs;
 
 /*
  * List of msr numbers which are used to expose MSR-based features that
  * can be used by a hypervisor to validate requested CPU features.
  */
-static u32 msr_based_features[] = {
+static const u32 msr_based_features_all[] = {
         MSR_IA32_VMX_BASIC,
         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
         MSR_IA32_VMX_PINBASED_CTLS,
@@ -1271,6 +1275,7 @@ static u32 msr_based_features[] = {
         MSR_IA32_ARCH_CAPABILITIES,
 };
 
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
 static unsigned int num_msr_based_features;
 
 static u64 kvm_get_arch_capabilities(void)
@@ -5118,22 +5123,22 @@ static void kvm_init_msr_list(void)
 {
         struct x86_pmu_capability x86_pmu;
         u32 dummy[2];
-        unsigned i, j;
+        unsigned i;
 
         BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
-                         "Please update the fixed PMCs in msrs_to_save[]");
+                         "Please update the fixed PMCs in msrs_to_saved_all[]");
 
         perf_get_x86_pmu_capability(&x86_pmu);
 
-        for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
-                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+        for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
+                if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
                         continue;
 
                 /*
                  * Even MSRs that are valid in the host may not be exposed
                  * to the guests in some cases.
                  */
-                switch (msrs_to_save[i]) {
+                switch (msrs_to_save_all[i]) {
                 case MSR_IA32_BNDCFGS:
                         if (!kvm_mpx_supported())
                                 continue;
@@ -5161,52 +5166,43 @@ static void kvm_init_msr_list(void)
                         break;
                 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
                         if (!kvm_x86_ops->pt_supported() ||
-                                msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
+                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
                                 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
                                 continue;
                         break;
                 case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
-                        if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
                             min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
                                 continue;
                         break;
                 case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
-                        if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
                             min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
                                 continue;
                 }
                 default:
                         break;
                 }
 
-                if (j < i)
-                        msrs_to_save[j] = msrs_to_save[i];
-                j++;
+                msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
         }
-        num_msrs_to_save = j;
 
-        for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
-                if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+        for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+                if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
                         continue;
 
-                if (j < i)
-                        emulated_msrs[j] = emulated_msrs[i];
-                j++;
+                emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
         }
-        num_emulated_msrs = j;
 
-        for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+        for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
                 struct kvm_msr_entry msr;
 
-                msr.index = msr_based_features[i];
+                msr.index = msr_based_features_all[i];
                 if (kvm_get_msr_feature(&msr))
                         continue;
 
-                if (j < i)
-                        msr_based_features[j] = msr_based_features[i];
-                j++;
+                msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
         }
-        num_msr_based_features = j;
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
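The x86.c change replaces in-place compaction of the msrs_to_save/emulated_msrs/msr_based_features arrays with const *_all master lists that are filtered into same-sized, zero-initialized runtime arrays, so re-running kvm_init_msr_list() (for example across a module unload/reload) always starts from the full list rather than from a previously compacted one. The sketch below is a small, self-contained illustration of that pattern only; the array contents, msr_is_supported(), and init_msr_list() here are made-up stand-ins, not the kernel's code.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Master list: const, never modified after build time. */
static const uint32_t msrs_all[] = { 0x174, 0x175, 0x176, 0xd90, 0x1234 };

/* Runtime list: same capacity, refilled from scratch on every init. */
static uint32_t msrs_supported[ARRAY_SIZE(msrs_all)];
static unsigned int num_msrs_supported;

/* Stand-in for the real capability probe (rdmsr_safe() etc.). */
static int msr_is_supported(uint32_t msr)
{
        return msr != 0x1234;   /* pretend exactly one MSR is unsupported */
}

static void init_msr_list(void)
{
        num_msrs_supported = 0; /* safe to call again after a "reload" */
        for (size_t i = 0; i < ARRAY_SIZE(msrs_all); i++) {
                if (!msr_is_supported(msrs_all[i]))
                        continue;
                msrs_supported[num_msrs_supported++] = msrs_all[i];
        }
}

int main(void)
{
        init_msr_list();
        printf("%u of %zu MSRs supported\n",
               num_msrs_supported, ARRAY_SIZE(msrs_all));
        return 0;
}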

include/linux/kvm_host.h

Lines changed: 1 addition & 0 deletions
@@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
         struct hlist_node link;

virt/kvm/kvm_main.c

Lines changed: 34 additions & 14 deletions
@@ -150,10 +150,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
         return 0;
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+        /*
+         * The metadata used by is_zone_device_page() to determine whether or
+         * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+         * the device has been pinned, e.g. by get_user_pages().  WARN if the
+         * page_count() is zero to help detect bad usage of this helper.
+         */
+        if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+                return false;
+
+        return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+        /*
+         * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+         * perspective they are "normal" pages, albeit with slightly different
+         * usage rules.
+         */
         if (pfn_valid(pfn))
-                return PageReserved(pfn_to_page(pfn));
+                return PageReserved(pfn_to_page(pfn)) &&
+                       !kvm_is_zone_device_pfn(pfn);
 
         return true;
 }
@@ -663,6 +683,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
 
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
+        if (init_srcu_struct(&kvm->srcu))
+                goto out_err_no_srcu;
+        if (init_srcu_struct(&kvm->irq_srcu))
+                goto out_err_no_irq_srcu;
+
+        refcount_set(&kvm->users_count, 1);
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                 struct kvm_memslots *slots = kvm_alloc_memslots();
 
@@ -680,7 +706,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
                         goto out_err_no_arch_destroy_vm;
         }
 
-        refcount_set(&kvm->users_count, 1);
         r = kvm_arch_init_vm(kvm, type);
         if (r)
                 goto out_err_no_arch_destroy_vm;
@@ -693,11 +718,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
-        if (init_srcu_struct(&kvm->srcu))
-                goto out_err_no_srcu;
-        if (init_srcu_struct(&kvm->irq_srcu))
-                goto out_err_no_irq_srcu;
-
         r = kvm_init_mmu_notifier(kvm);
         if (r)
                 goto out_err_no_mmu_notifier;
@@ -720,19 +740,19 @@ static struct kvm *kvm_create_vm(unsigned long type)
         mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-        cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-        cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
         hardware_disable_all();
 out_err_no_disable:
         kvm_arch_destroy_vm(kvm);
-        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
 out_err_no_arch_destroy_vm:
+        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kfree(kvm_get_bus(kvm, i));
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+        cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+        cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
         kvm_arch_free_vm(kvm);
         mmdrop(current->mm);
         return ERR_PTR(r);
@@ -1886,7 +1906,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-        if (!kvm_is_reserved_pfn(pfn)) {
+        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                 struct page *page = pfn_to_page(pfn);
 
                 SetPageDirty(page);
@@ -1896,7 +1916,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-        if (!kvm_is_reserved_pfn(pfn))
+        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                 mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
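The reordered kvm_create_vm() hunks follow one principle: initialize in a fixed order, release in strict reverse order, and make each error label skip the teardown of anything that was not yet set up, with the users_count reference taken before any failure path that decrements it. The sketch below is a minimal, self-contained illustration of that goto-unwind pattern under those assumptions; struct vm, init_srcu(), init_arch(), and the labels are made-up stand-ins, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

struct vm {
        int refs;
        int srcu_ready;
};

static int init_srcu(struct vm *vm)     { vm->srcu_ready = 1; return 0; }
static void cleanup_srcu(struct vm *vm) { vm->srcu_ready = 0; }
static int init_arch(struct vm *vm)     { (void)vm; return -1; /* simulate failure */ }

static struct vm *create_vm(void)
{
        struct vm *vm = calloc(1, sizeof(*vm));

        if (!vm)
                return NULL;

        if (init_srcu(vm))
                goto out_err_no_srcu;

        vm->refs = 1;                   /* taken before any path that drops it */

        if (init_arch(vm))
                goto out_err_no_arch;

        return vm;

        /* Error labels unwind in strict reverse order of initialization, so a
         * failure at step N never tears down something step N didn't set up. */
out_err_no_arch:
        vm->refs--;                     /* balances refs = 1 above */
        cleanup_srcu(vm);
out_err_no_srcu:
        free(vm);
        return NULL;
}

int main(void)
{
        printf("create_vm() -> %p\n", (void *)create_vm());
        return 0;
}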
