Skip to content

Commit 94b37cb

Browse files
kalyazinShadowCurse
authored andcommitted
fix(snapshot/x86_64): make sure TSC_DEADLINE MSR is non-zero
On x86_64, we observed that when restoring from a snapshot, one of the vCPUs had MSR_IA32_TSC_DEADLINE cleared and never received TSC interrupts until the MSR is updated externally (eg by setting the system time). We believe this happens because the TSC interrupt is lost during snapshot taking process: the MSR is cleared, but the interrupt is not delivered to the guest, so the guest does not rearm the timer. A visible effect of that is failure to connect to a restored VM via SSH. This commit introduces a workaround. If when taking a snapshot, we see a zero MSR_IA32_TSC_DEADLINE, we replace its value with the MSR_IA32_TSC value from the same vCPU to make sure that the vCPU will continue to receive TSC interrupts. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 3ce507f commit 94b37cb

File tree

3 files changed

+121
-0
lines changed

3 files changed

+121
-0
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ and this project adheres to
8484
UFFD support not being forward-compatible with new ioctl options introduced in
8585
Linux 6.6. See also
8686
https://github.com/bytecodealliance/userfaultfd-rs/issues/61.
87+
- [#4618](https://github.com/firecracker-microvm/firecracker/pull/4618): On
88+
x86_64, when taking a snapshot, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0,
89+
Firecracker will replace it with the MSR_IA32_TSC value from the same vCPU.
90+
This is to guarantee that the vCPU will continue receiving TSC interrupts
91+
after restoring from the snapshot even if an interrupt is lost when taking a
92+
snapshot.
8793

8894
## \[1.7.0\]
8995

docs/snapshotting/snapshot-support.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ The snapshot functionality is still in developer preview due to the following:
171171
the data store is not persisted across snapshots.
172172
- Configuration information for metrics and logs are not saved to the snapshot.
173173
These need to be reconfigured on the restored microVM.
174+
- On x86_64, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0 when a snapshot is
175+
taken, Firecracker replaces it with the MSR_IA32_TSC value from the same vCPU.
176+
This is to guarantee that the vCPU will continue receiving TSC interrupts
177+
after restoring from the snapshot even if an interrupt is lost when taking a
178+
snapshot.
174179

175180
## Snapshot versioning
176181

src/vmm/src/vstate/vcpu/x86_64.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize};
1919
use crate::arch::x86_64::interrupts;
2020
use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
2121
use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
22+
use crate::arch_gen::x86::msr_index::{MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE};
2223
use crate::cpu_config::x86_64::{cpuid, CpuConfiguration};
2324
use crate::logger::{IncMetric, METRICS};
2425
use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap};
@@ -282,6 +283,39 @@ impl KvmVcpu {
282283
Ok(cpuid)
283284
}
284285

286+
/// If the IA32_TSC_DEADLINE MSR value is zero, update it
287+
/// with the IA32_TSC value to guarantee that
288+
/// the vCPU will continue receiving interrupts after restoring from a snapshot.
289+
///
290+
/// Rationale: we observed that sometimes when taking a snapshot,
291+
/// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
292+
/// delivered to the guest, leading to a situation where one
293+
/// of the vCPUs never receives TSC interrupts after restoring,
294+
/// until the MSR is updated externally, eg by setting the system time.
295+
fn fix_zero_tsc_deadline_msr(msr_chunks: &mut [Msrs]) {
296+
// We do not expect more than 1 TSC MSR entry, but if there are multiple, pick the maximum.
297+
let max_tsc_value = msr_chunks
298+
.iter()
299+
.flat_map(|msrs| msrs.as_slice())
300+
.filter(|msr| msr.index == MSR_IA32_TSC)
301+
.map(|msr| msr.data)
302+
.max();
303+
304+
if let Some(tsc_value) = max_tsc_value {
305+
msr_chunks
306+
.iter_mut()
307+
.flat_map(|msrs| msrs.as_mut_slice())
308+
.filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0)
309+
.for_each(|msr| {
310+
warn!(
311+
"MSR_IA32_TSC_DEADLINE is 0, replacing with {:x}.",
312+
tsc_value
313+
);
314+
msr.data = tsc_value;
315+
});
316+
}
317+
}
318+
285319
/// Get MSR chunks for the given MSR index list.
286320
///
287321
/// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
@@ -321,6 +355,8 @@ impl KvmVcpu {
321355
msr_chunks.push(msrs);
322356
}
323357

358+
Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);
359+
324360
Ok(msr_chunks)
325361
}
326362

@@ -594,6 +630,7 @@ mod tests {
594630

595631
use std::os::unix::io::AsRawFd;
596632

633+
use kvm_bindings::kvm_msr_entry;
597634
use kvm_ioctls::Cap;
598635

599636
use super::*;
@@ -949,4 +986,77 @@ mod tests {
949986
}
950987
}
951988
}
989+
990+
fn msrs_from_entries(msr_entries: &[(u32, u64)]) -> Msrs {
991+
Msrs::from_entries(
992+
&msr_entries
993+
.iter()
994+
.map(|&(index, data)| kvm_msr_entry {
995+
index,
996+
data,
997+
..Default::default()
998+
})
999+
.collect::<Vec<_>>(),
1000+
)
1001+
.unwrap()
1002+
}
1003+
1004+
fn assert_msrs(msr_chunks: &[Msrs], expected_msr_entries: &[(u32, u64)]) {
1005+
let flattened_msrs = msr_chunks.iter().flat_map(|msrs| msrs.as_slice());
1006+
for (a, b) in flattened_msrs.zip(expected_msr_entries.iter()) {
1007+
assert_eq!(a.index, b.0);
1008+
assert_eq!(a.data, b.1);
1009+
}
1010+
}
1011+
1012+
#[test]
1013+
fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
1014+
// Place both TSC and TSC_DEADLINE MSRs in the same chunk.
1015+
let mut msr_chunks = [msrs_from_entries(&[
1016+
(MSR_IA32_TSC_DEADLINE, 0),
1017+
(MSR_IA32_TSC, 42),
1018+
])];
1019+
1020+
KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1021+
1022+
// We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
1023+
assert_msrs(
1024+
&msr_chunks,
1025+
&[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
1026+
);
1027+
}
1028+
1029+
#[test]
1030+
fn test_fix_zero_tsc_deadline_msr_zero_separate_chunks() {
1031+
// Place both TSC and TSC_DEADLINE MSRs in separate chunks.
1032+
let mut msr_chunks = [
1033+
msrs_from_entries(&[(MSR_IA32_TSC_DEADLINE, 0)]),
1034+
msrs_from_entries(&[(MSR_IA32_TSC, 42)]),
1035+
];
1036+
1037+
KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1038+
1039+
// We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
1040+
assert_msrs(
1041+
&msr_chunks,
1042+
&[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
1043+
);
1044+
}
1045+
1046+
#[test]
1047+
fn test_fix_zero_tsc_deadline_msr_non_zero() {
1048+
let mut msr_chunks = [msrs_from_entries(&[
1049+
(MSR_IA32_TSC_DEADLINE, 1),
1050+
(MSR_IA32_TSC, 2),
1051+
])];
1052+
1053+
KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1054+
1055+
// We expect that MSR_IA32_TSC_DEADLINE should remain unchanged, because it is non-zero
1056+
// already.
1057+
assert_msrs(
1058+
&msr_chunks,
1059+
&[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)],
1060+
);
1061+
}
9521062
}

0 commit comments

Comments
 (0)