Skip to content

Commit 7e2175e

Browse files
dwmw2 and bonzini
authored and committed
KVM: x86: Fix recording of guest steal time / preempted status
In commit b043138 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed") we switched to using a gfn_to_pfn_cache for accessing the guest steal time structure in order to allow for an atomic xchg of the preempted field. This has a couple of problems. Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a guest vCPU using an IOMEM page for its steal time would never have its preempted field set. Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it should have been. There are two stages to the GFN->PFN conversion; first the GFN is converted to a userspace HVA, and then that HVA is looked up in the process page tables to find the underlying host PFN. Correct invalidation of the latter would require being hooked up to the MMU notifiers, but that doesn't happen---so it just keeps mapping and unmapping the *wrong* PFN after the userspace page tables change. In the !IOMEM case at least the stale page *is* pinned all the time it's cached, so it won't be freed and reused by anyone else while still receiving the steal time updates. The map/unmap dance only takes care of the KVM administrivia such as marking the page dirty. Until the gfn_to_pfn cache handles the remapping automatically by integrating with the MMU notifiers, we might as well not get a kernel mapping of it, and use the perfectly serviceable userspace HVA that we already have. We just need to implement the atomic xchg on the userspace address with appropriate exception handling, which is fairly trivial. Cc: [email protected] Fixes: b043138 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed") Signed-off-by: David Woodhouse <[email protected]> Message-Id: <[email protected]> [I didn't entirely agree with David's assessment of the usefulness of the gfn_to_pfn cache, and integrated the outcome of the discussion in the above commit message. - Paolo] Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 52cf891 commit 7e2175e

File tree

2 files changed

+76
-31
lines changed

2 files changed

+76
-31
lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,7 @@ struct kvm_vcpu_arch {
748748
u8 preempted;
749749
u64 msr_val;
750750
u64 last_steal;
751-
struct gfn_to_pfn_cache cache;
751+
struct gfn_to_hva_cache cache;
752752
} st;
753753

754754
u64 l1_tsc_offset;

arch/x86/kvm/x86.c

Lines changed: 75 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3260,8 +3260,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
32603260

32613261
static void record_steal_time(struct kvm_vcpu *vcpu)
32623262
{
3263-
struct kvm_host_map map;
3264-
struct kvm_steal_time *st;
3263+
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3264+
struct kvm_steal_time __user *st;
3265+
struct kvm_memslots *slots;
3266+
u64 steal;
3267+
u32 version;
32653268

32663269
if (kvm_xen_msr_enabled(vcpu->kvm)) {
32673270
kvm_xen_runstate_set_running(vcpu);
@@ -3271,47 +3274,83 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
32713274
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
32723275
return;
32733276

3274-
/* -EAGAIN is returned in atomic context so we can just return. */
3275-
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
3276-
&map, &vcpu->arch.st.cache, false))
3277+
if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
32773278
return;
32783279

3279-
st = map.hva +
3280-
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3280+
slots = kvm_memslots(vcpu->kvm);
3281+
3282+
if (unlikely(slots->generation != ghc->generation ||
3283+
kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3284+
gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3285+
3286+
/* We rely on the fact that it fits in a single page. */
3287+
BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3288+
3289+
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
3290+
kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3291+
return;
3292+
}
3293+
3294+
st = (struct kvm_steal_time __user *)ghc->hva;
3295+
if (!user_access_begin(st, sizeof(*st)))
3296+
return;
32813297

32823298
/*
32833299
* Doing a TLB flush here, on the guest's behalf, can avoid
32843300
* expensive IPIs.
32853301
*/
32863302
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3287-
u8 st_preempted = xchg(&st->preempted, 0);
3303+
u8 st_preempted = 0;
3304+
int err = -EFAULT;
3305+
3306+
asm volatile("1: xchgb %0, %2\n"
3307+
"xor %1, %1\n"
3308+
"2:\n"
3309+
_ASM_EXTABLE_UA(1b, 2b)
3310+
: "+r" (st_preempted),
3311+
"+&r" (err)
3312+
: "m" (st->preempted));
3313+
if (err)
3314+
goto out;
3315+
3316+
user_access_end();
3317+
3318+
vcpu->arch.st.preempted = 0;
32883319

32893320
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
32903321
st_preempted & KVM_VCPU_FLUSH_TLB);
32913322
if (st_preempted & KVM_VCPU_FLUSH_TLB)
32923323
kvm_vcpu_flush_tlb_guest(vcpu);
3324+
3325+
if (!user_access_begin(st, sizeof(*st)))
3326+
goto dirty;
32933327
} else {
3294-
st->preempted = 0;
3328+
unsafe_put_user(0, &st->preempted, out);
3329+
vcpu->arch.st.preempted = 0;
32953330
}
32963331

3297-
vcpu->arch.st.preempted = 0;
3298-
3299-
if (st->version & 1)
3300-
st->version += 1; /* first time write, random junk */
3332+
unsafe_get_user(version, &st->version, out);
3333+
if (version & 1)
3334+
version += 1; /* first time write, random junk */
33013335

3302-
st->version += 1;
3336+
version += 1;
3337+
unsafe_put_user(version, &st->version, out);
33033338

33043339
smp_wmb();
33053340

3306-
st->steal += current->sched_info.run_delay -
3341+
unsafe_get_user(steal, &st->steal, out);
3342+
steal += current->sched_info.run_delay -
33073343
vcpu->arch.st.last_steal;
33083344
vcpu->arch.st.last_steal = current->sched_info.run_delay;
3345+
unsafe_put_user(steal, &st->steal, out);
33093346

3310-
smp_wmb();
3311-
3312-
st->version += 1;
3347+
version += 1;
3348+
unsafe_put_user(version, &st->version, out);
33133349

3314-
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
3350+
out:
3351+
user_access_end();
3352+
dirty:
3353+
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
33153354
}
33163355

33173356
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4351,25 +4390,34 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
43514390

43524391
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
43534392
{
4354-
struct kvm_host_map map;
4355-
struct kvm_steal_time *st;
4393+
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4394+
struct kvm_steal_time __user *st;
4395+
struct kvm_memslots *slots;
4396+
static const u8 preempted = KVM_VCPU_PREEMPTED;
43564397

43574398
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
43584399
return;
43594400

43604401
if (vcpu->arch.st.preempted)
43614402
return;
43624403

4363-
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
4364-
&vcpu->arch.st.cache, true))
4404+
/* This happens on process exit */
4405+
if (unlikely(current->mm != vcpu->kvm->mm))
43654406
return;
43664407

4367-
st = map.hva +
4368-
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
4408+
slots = kvm_memslots(vcpu->kvm);
4409+
4410+
if (unlikely(slots->generation != ghc->generation ||
4411+
kvm_is_error_hva(ghc->hva) || !ghc->memslot))
4412+
return;
43694413

4370-
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4414+
st = (struct kvm_steal_time __user *)ghc->hva;
4415+
BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
43714416

4372-
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
4417+
if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4418+
vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4419+
4420+
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
43734421
}
43744422

43754423
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -11022,11 +11070,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
1102211070

1102311071
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
1102411072
{
11025-
struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
1102611073
int idx;
1102711074

11028-
kvm_release_pfn(cache->pfn, cache->dirty, cache);
11029-
1103011075
kvmclock_reset(vcpu);
1103111076

1103211077
static_call(kvm_x86_vcpu_free)(vcpu);

0 commit comments

Comments
 (0)