fix(snapshot/x86_64): make sure TSC_DEADLINE MSR is non-zero

kalyazin · ShadowCurse · commit 94b37cb89b05 · 2024-05-31T12:56:58.000+01:00
On x86_64, we observed that when restoring from a snapshot,
one of the vCPUs had MSR_IA32_TSC_DEADLINE cleared and never
received TSC interrupts until the MSR is updated externally
(eg by setting the system time).

We believe this happens because the TSC interrupt is lost
during snapshot taking process: the MSR is cleared, but the
interrupt is not delivered to the guest, so the guest
does not rearm the timer.

A visible effect of that is failure to connect to a restored VM
via SSH.

This commit introduces a workaround. If when taking a snapshot,
we see a zero MSR_IA32_TSC_DEADLINE, we replace its value with
the MSR_IA32_TSC value from the same vCPU to make sure that
the vCPU will continue to receive TSC interrupts.

Signed-off-by: Nikita Kalyazin &lt;kalyazin@amazon.com&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -84,6 +84,12 @@ and this project adheres to
   UFFD support not being forward-compatible with new ioctl options introduced in
   Linux 6.6. See also
   https://github.com/bytecodealliance/userfaultfd-rs/issues/61.
+- [#4618](https://github.com/firecracker-microvm/firecracker/pull/4618): On
+  x86_64, when taking a snapshot, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0,
+  Firecracker will replace it with the MSR_IA32_TSC value from the same vCPU.
+  This is to guarantee that the vCPU will continue receiving TSC interrupts
+  after restoring from the snapshot even if an interrupt is lost when taking a
+  snapshot.
 
 ## \[1.7.0\]
 
diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md
@@ -171,6 +171,11 @@ The snapshot functionality is still in developer preview due to the following:
   the data store is not persisted across snapshots.
 - Configuration information for metrics and logs are not saved to the snapshot.
   These need to be reconfigured on the restored microVM.
+- On x86_64, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0 when a snapshot is
+  taken, Firecracker replaces it with the MSR_IA32_TSC value from the same vCPU.
+  This is to guarantee that the vCPU will continue receiving TSC interrupts
+  after restoring from the snapshot even if an interrupt is lost when taking a
+  snapshot.
 
 ## Snapshot versioning
 
diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs
@@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize};
 use crate::arch::x86_64::interrupts;
 use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
 use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
+use crate::arch_gen::x86::msr_index::{MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE};
 use crate::cpu_config::x86_64::{cpuid, CpuConfiguration};
 use crate::logger::{IncMetric, METRICS};
 use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap};
@@ -282,6 +283,39 @@ impl KvmVcpu {
         Ok(cpuid)
     }
 
+    /// If the IA32_TSC_DEADLINE MSR value is zero, update it
+    /// with the IA32_TSC value to guarantee that
+    /// the vCPU will continue receiving interrupts after restoring from a snapshot.
+    ///
+    /// Rationale: we observed that sometimes when taking a snapshot,
+    /// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
+    /// delivered to the guest, leading to a situation where one
+    /// of the vCPUs never receives TSC interrupts after restoring,
+    /// until the MSR is updated externally, eg by setting the system time.
+    fn fix_zero_tsc_deadline_msr(msr_chunks: &mut [Msrs]) {
+        // We do not expect more than 1 TSC MSR entry, but if there are multiple, pick the maximum.
+        let max_tsc_value = msr_chunks
+            .iter()
+            .flat_map(|msrs| msrs.as_slice())
+            .filter(|msr| msr.index == MSR_IA32_TSC)
+            .map(|msr| msr.data)
+            .max();
+
+        if let Some(tsc_value) = max_tsc_value {
+            msr_chunks
+                .iter_mut()
+                .flat_map(|msrs| msrs.as_mut_slice())
+                .filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0)
+                .for_each(|msr| {
+                    warn!(
+                        "MSR_IA32_TSC_DEADLINE is 0, replacing with {:x}.",
+                        tsc_value
+                    );
+                    msr.data = tsc_value;
+                });
+        }
+    }
+
     /// Get MSR chunks for the given MSR index list.
     ///
     /// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
@@ -321,6 +355,8 @@ impl KvmVcpu {
             msr_chunks.push(msrs);
         }
 
+        Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);
+
         Ok(msr_chunks)
     }
 
@@ -594,6 +630,7 @@ mod tests {
 
     use std::os::unix::io::AsRawFd;
 
+    use kvm_bindings::kvm_msr_entry;
     use kvm_ioctls::Cap;
 
     use super::*;
@@ -949,4 +986,77 @@ mod tests {
             }
         }
     }
+
+    fn msrs_from_entries(msr_entries: &[(u32, u64)]) -> Msrs {
+        Msrs::from_entries(
+            &msr_entries
+                .iter()
+                .map(|&(index, data)| kvm_msr_entry {
+                    index,
+                    data,
+                    ..Default::default()
+                })
+                .collect::<Vec<_>>(),
+        )
+        .unwrap()
+    }
+
+    fn assert_msrs(msr_chunks: &[Msrs], expected_msr_entries: &[(u32, u64)]) {
+        let flattened_msrs = msr_chunks.iter().flat_map(|msrs| msrs.as_slice());
+        for (a, b) in flattened_msrs.zip(expected_msr_entries.iter()) {
+            assert_eq!(a.index, b.0);
+            assert_eq!(a.data, b.1);
+        }
+    }
+
+    #[test]
+    fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
+        // Place both TSC and TSC_DEADLINE MSRs in the same chunk.
+        let mut msr_chunks = [msrs_from_entries(&[
+            (MSR_IA32_TSC_DEADLINE, 0),
+            (MSR_IA32_TSC, 42),
+        ])];
+
+        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
+
+        // We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
+        assert_msrs(
+            &msr_chunks,
+            &[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
+        );
+    }
+
+    #[test]
+    fn test_fix_zero_tsc_deadline_msr_zero_separate_chunks() {
+        // Place both TSC and TSC_DEADLINE MSRs in separate chunks.
+        let mut msr_chunks = [
+            msrs_from_entries(&[(MSR_IA32_TSC_DEADLINE, 0)]),
+            msrs_from_entries(&[(MSR_IA32_TSC, 42)]),
+        ];
+
+        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
+
+        // We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
+        assert_msrs(
+            &msr_chunks,
+            &[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
+        );
+    }
+
+    #[test]
+    fn test_fix_zero_tsc_deadline_msr_non_zero() {
+        let mut msr_chunks = [msrs_from_entries(&[
+            (MSR_IA32_TSC_DEADLINE, 1),
+            (MSR_IA32_TSC, 2),
+        ])];
+
+        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
+
+        // We expect that MSR_IA32_TSC_DEADLINE should remain unchanged, because it is non-zero
+        // already.
+        assert_msrs(
+            &msr_chunks,
+            &[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)],
+        );
+    }
 }