
Commit 5d6d6a7

dwmw2 authored and sean-jc committed
KVM: x86: Refine calculation of guest wall clock to use a single TSC read
When populating the guest's PV wall clock information, KVM currently does a simple 'ktime_get_real_ns() - get_kvmclock_ns(kvm)'. This is an antipattern which should be avoided; when working with the relationship between two clocks, it's never correct to obtain one of them "now" and then the other at a slightly different "now" after an unspecified period of preemption (which might not even be under the control of the kernel, if this is an L1 hosting an L2 guest under nested virtualization).

Add a kvm_get_wall_clock_epoch() function to return the guest wall clock epoch in nanoseconds using the same method as __get_kvmclock(): use kvm_get_walltime_and_clockread() to calculate both the wall clock and KVM clock time from a *single* TSC reading.

The condition using get_cpu_tsc_khz() is equivalent to the version in __get_kvmclock(), which separately checks for the CONSTANT_TSC feature or the per-CPU cpu_tsc_khz; that is what get_cpu_tsc_khz() does anyway.

Signed-off-by: David Woodhouse <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Sean Christopherson <[email protected]>
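As an aside, a minimal userspace sketch of why the single-sample approach works: if both the wall clock and the kvmclock are expressed as functions of one counter value, the delta since the snapshot cancels when one is subtracted from the other, so the computed epoch does not depend on when the sample is taken (up to the NTP adjustment of the wall clock that the patch's own comment mentions). The struct clk type, the clk_at() helper and all numbers below are invented for illustration and are not part of the patch.

/* Hypothetical sketch: both clocks derived from one counter sample, so the
 * epoch (wall clock time at kvmclock == 0) is stable regardless of when the
 * sample is taken.  All parameter values are illustrative. */
#include <stdint.h>
#include <stdio.h>

struct clk {
        uint64_t base_count;    /* counter value at the snapshot */
        uint64_t base_ns;       /* clock reading (ns) at the snapshot */
        uint64_t num, den;      /* ns per counter tick, as a ratio */
};

static uint64_t clk_at(const struct clk *c, uint64_t count)
{
        return c->base_ns + (count - c->base_count) * c->num / c->den;
}

int main(void)
{
        /* Illustrative snapshots taken at the same counter value. */
        struct clk wall     = { 1000, 1700000000000000000ull, 1, 3 };
        struct clk kvmclock = { 1000, 5000000000ull,          1, 3 };

        /* One counter sample drives both conversions, so the delta terms
         * cancel in the subtraction. */
        uint64_t sample = 123456;
        uint64_t epoch = clk_at(&wall, sample) - clk_at(&kvmclock, sample);

        printf("guest epoch: %llu ns since 1970-01-01\n",
               (unsigned long long)epoch);
        return 0;
}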
1 parent e47d860 commit 5d6d6a7

3 files changed: +81, -9 lines

arch/x86/kvm/x86.c

Lines changed: 78 additions & 7 deletions
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
 	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
 		return;
 
-	/*
-	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_guest_time_update below) to the
-	 * wall clock specified here. We do the reverse here.
-	 */
-	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+	wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-	wc.nsec = do_div(wall_nsec, 1000000000);
+	wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
 	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
 	wc.version = version;
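A note on the do_div() line above: the kernel's do_div(n, base) divides the 64-bit n in place and returns the remainder, so wc.nsec receives the sub-second part while wall_nsec is left holding whole seconds, which are then truncated to 32 bits (hence the 2106 overflow comment). A standalone C sketch of the same split, with an invented example value:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ull

int main(void)
{
        uint64_t wall_nsec = 1700000000123456789ull;    /* example epoch in ns */

        /* Equivalent of wc.nsec = do_div(wall_nsec, NSEC_PER_SEC): the
         * remainder becomes the sub-second part, the quotient stays in
         * wall_nsec. */
        uint32_t nsec = (uint32_t)(wall_nsec % NSEC_PER_SEC);
        wall_nsec /= NSEC_PER_SEC;

        uint32_t sec = (uint32_t)wall_nsec;     /* 32-bit seconds wrap in 2106 */

        printf("sec=%u nsec=%u\n", sec, nsec);
        return 0;
}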

@@ -3241,6 +3236,82 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct pvclock_vcpu_time_info hv_clock;
+	struct kvm_arch *ka = &kvm->arch;
+	unsigned long seq, local_tsc_khz;
+	struct timespec64 ts;
+	uint64_t host_tsc;
+
+	do {
+		seq = read_seqcount_begin(&ka->pvclock_sc);
+
+		local_tsc_khz = 0;
+		if (!ka->use_master_clock)
+			break;
+
+		/*
+		 * The TSC read and the call to get_cpu_tsc_khz() must happen
+		 * on the same CPU.
+		 */
+		get_cpu();
+
+		local_tsc_khz = get_cpu_tsc_khz();
+
+		if (local_tsc_khz &&
+		    !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+			local_tsc_khz = 0; /* Fall back to old method */
+
+		put_cpu();
+
+		/*
+		 * These values must be snapshotted within the seqcount loop.
+		 * After that, it's just mathematics which can happen on any
+		 * CPU at any time.
+		 */
+		hv_clock.tsc_timestamp = ka->master_cycle_now;
+		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+	/*
+	 * If the conditions were right, and obtaining the wallclock+TSC was
+	 * successful, calculate the KVM clock at the corresponding time and
+	 * subtract one from the other to get the guest's epoch in nanoseconds
+	 * since 1970-01-01.
+	 */
+	if (local_tsc_khz) {
+		kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+				   &hv_clock.tsc_shift,
+				   &hv_clock.tsc_to_system_mul);
+		return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+			__pvclock_read_cycles(&hv_clock, host_tsc);
+	}
+#endif
+	return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
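For context on the maths in the new helper: kvm_get_time_scale() produces a (tsc_shift, tsc_to_system_mul) pair, and __pvclock_read_cycles() uses it to turn the TSC delta since the snapshot into nanoseconds and add the snapshotted system_time; subtracting that from the wall clock time taken at the same host_tsc yields the epoch. The sketch below shows that scaling in plain C; struct pv_time, kvmclock_at() and the example numbers are stand-ins invented here, a best-effort reading of the fixed-point scheme rather than the kernel's exact implementation.

/* Illustrative stand-in for the pvclock scaling: convert a TSC delta to
 * nanoseconds with a power-of-two pre-shift and a 32.32 fixed-point
 * multiplier, then add the snapshotted kvmclock value. */
#include <stdint.h>
#include <stdio.h>

struct pv_time {
        uint64_t tsc_timestamp;         /* TSC at the snapshot */
        uint64_t system_time;           /* kvmclock ns at the snapshot */
        uint32_t tsc_to_system_mul;     /* ns per cycle, 32.32 fixed point */
        int8_t tsc_shift;               /* pre-scale applied to the delta */
};

static uint64_t kvmclock_at(const struct pv_time *pv, uint64_t host_tsc)
{
        uint64_t delta = host_tsc - pv->tsc_timestamp;

        if (pv->tsc_shift >= 0)
                delta <<= pv->tsc_shift;
        else
                delta >>= -pv->tsc_shift;

        /* Widen to 128 bits so the fixed-point multiply cannot overflow. */
        return pv->system_time +
               (uint64_t)(((unsigned __int128)delta * pv->tsc_to_system_mul) >> 32);
}

int main(void)
{
        /* Invented example: a 2 GHz TSC, i.e. 0.5 ns per cycle, which in
         * 32.32 fixed point is 0x80000000 with no pre-shift. */
        struct pv_time pv = {
                .tsc_timestamp = 1000000,
                .system_time = 5000000000ull,   /* kvmclock was 5 s at the snapshot */
                .tsc_to_system_mul = 0x80000000u,
                .tsc_shift = 0,
        };
        uint64_t host_tsc = 3000000;            /* one TSC sample */

        printf("kvmclock at sample: %llu ns\n",
               (unsigned long long)kvmclock_at(&pv, host_tsc));
        return 0;
}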

arch/x86/kvm/x86.h

Lines changed: 1 addition & 0 deletions
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 	gva_t addr, void *val, unsigned int bytes,

arch/x86/kvm/xen.c

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 	 * This code mirrors kvm_write_wall_clock() except that it writes
 	 * directly through the pfn cache and doesn't mark the page dirty.
 	 */
-	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+	wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
 	/* It could be invalid again already, so we need to check */
 	read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 	wc_version = wc->version = (wc->version + 1) | 1;
 	smp_wmb();
 
-	wc->nsec = do_div(wall_nsec, 1000000000);
+	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
 	wc->sec = (u32)wall_nsec;
 	*wc_sec_hi = wall_nsec >> 32;
 	smp_wmb();
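The wc->version assignment and smp_wmb() above are the writer half of the usual pvclock/Xen wall clock versioning scheme: the version is made odd before the fields are rewritten (and, outside this hunk, presumably bumped back to an even value afterwards), so a guest can detect and retry a torn read. A hedged sketch of the reader side, using C11 fences in place of the guest's real barriers; the struct layout and names are invented for illustration:

/* Illustrative guest-side reader for a versioned wall clock record. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct wall_clock {
        uint32_t version;       /* odd while the host is updating */
        uint32_t sec;
        uint32_t nsec;
};

static void read_wall_clock(const volatile struct wall_clock *wc,
                            uint32_t *sec, uint32_t *nsec)
{
        uint32_t ver;

        do {
                ver = wc->version;
                atomic_thread_fence(memory_order_acquire);      /* pairs with host smp_wmb() */
                *sec = wc->sec;
                *nsec = wc->nsec;
                atomic_thread_fence(memory_order_acquire);
        } while ((ver & 1) || ver != wc->version);      /* retry on a torn update */
}

int main(void)
{
        /* Stand-in for the shared page; a real guest maps this from the host. */
        static volatile struct wall_clock wc = { .version = 2, .sec = 1700000000u, .nsec = 42 };
        uint32_t sec, nsec;

        read_wall_clock(&wc, &sec, &nsec);
        printf("wall clock epoch: %u.%09u\n", sec, nsec);
        return 0;
}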
