From 4bca17f772b56c2daf56783b0d60097d1b6dc5df Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Tue, 14 Jan 2025 17:32:43 +0000 Subject: [PATCH 1/5] refactor: move KVM related logic into a separate struct `Vm` constructor was the only place where the `/dev/kvm` was open and only there we could do any KVM (not VM) specific checks. By moving this KVM logic into a separate struct we can can do KVM specific actions (like checking optional KVM capabilities) without needing to reopen the `/dev/kvm` again. Signed-off-by: Egor Lazarchuk --- src/vmm/src/arch/aarch64/vcpu.rs | 18 +-- src/vmm/src/builder.rs | 38 +++-- src/vmm/src/device_manager/legacy.rs | 7 +- src/vmm/src/device_manager/mmio.rs | 10 +- src/vmm/src/lib.rs | 6 + src/vmm/src/persist.rs | 4 + src/vmm/src/vstate/kvm.rs | 205 +++++++++++++++++++++++++++ src/vmm/src/vstate/mod.rs | 2 + src/vmm/src/vstate/vcpu/aarch64.rs | 46 +++--- src/vmm/src/vstate/vcpu/mod.rs | 34 ++--- src/vmm/src/vstate/vcpu/x86_64.rs | 77 +++++----- src/vmm/src/vstate/vm.rs | 198 ++++---------------------- 12 files changed, 372 insertions(+), 273 deletions(-) create mode 100644 src/vmm/src/vstate/kvm.rs diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 80fc5a339df..859e2da2cb6 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -214,16 +214,16 @@ pub fn set_mpstate(vcpufd: &VcpuFd, state: kvm_mp_state) -> Result<(), VcpuError #[cfg(test)] mod tests { #![allow(clippy::undocumented_unsafe_blocks)] - use kvm_ioctls::Kvm; use super::*; use crate::arch::aarch64::layout; use crate::test_utils::arch_mem; + use crate::vstate::kvm::Kvm; #[test] fn test_setup_regs() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); @@ -242,8 +242,8 @@ mod tests { #[test] fn test_read_mpidr() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); @@ -261,8 +261,8 @@ mod tests { #[test] fn test_get_set_regs() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); @@ -283,8 +283,8 @@ mod tests { fn test_mpstate() { use std::os::unix::io::AsRawFd; - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 999e27415fc..6a59fe35a83 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -68,6 +68,7 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError}; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemory, 
GuestMemoryMmap}; use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuError}; use crate::vstate::vm::Vm; @@ -160,11 +161,17 @@ fn create_vmm_and_vcpus( ) -> Result<(Vmm, Vec), StartMicrovmError> { use self::StartMicrovmError::*; + let kvm = Kvm::new(kvm_capabilities) + .map_err(VmmError::Kvm) + .map_err(StartMicrovmError::Internal)?; // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(kvm_capabilities) + let mut vm = Vm::new(&kvm) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; + kvm.check_memory(&guest_memory) + .map_err(VmmError::Kvm) + .map_err(StartMicrovmError::Internal)?; vm.memory_init(&guest_memory, track_dirty_pages) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; @@ -186,7 +193,7 @@ fn create_vmm_and_vcpus( #[cfg(target_arch = "x86_64")] let (vcpus, pio_device_manager) = { setup_interrupt_controller(&mut vm)?; - let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; + let vcpus = create_vcpus(&kvm, &vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; // Make stdout non blocking. set_stdout_nonblocking(); @@ -218,7 +225,7 @@ fn create_vmm_and_vcpus( // Search for `kvm_arch_vcpu_create` in arch/arm/kvm/arm.c. #[cfg(target_arch = "aarch64")] let vcpus = { - let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; + let vcpus = create_vcpus(&kvm, &vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; setup_interrupt_controller(&mut vm, vcpu_count)?; vcpus }; @@ -227,6 +234,7 @@ fn create_vmm_and_vcpus( events_observer: Some(std::io::stdin()), instance_info: instance_info.clone(), shutdown_exit_code: None, + kvm, vm, guest_memory, uffd, @@ -476,7 +484,7 @@ pub fn build_microvm_from_snapshot( uffd, vm_resources.machine_config.track_dirty_pages, vm_resources.machine_config.vcpu_count, - microvm_state.vm_state.kvm_cap_modifiers.clone(), + microvm_state.kvm_state.kvm_cap_modifiers.clone(), )?; #[cfg(target_arch = "x86_64")] @@ -738,11 +746,16 @@ fn attach_legacy_devices_aarch64( .map_err(VmmError::RegisterMMIODevice) } -fn create_vcpus(vm: &Vm, vcpu_count: u8, exit_evt: &EventFd) -> Result, VmmError> { +fn create_vcpus( + kvm: &Kvm, + vm: &Vm, + vcpu_count: u8, + exit_evt: &EventFd, +) -> Result, VmmError> { let mut vcpus = Vec::with_capacity(vcpu_count as usize); for cpu_idx in 0..vcpu_count { let exit_evt = exit_evt.try_clone().map_err(VmmError::EventFd)?; - let vcpu = Vcpu::new(cpu_idx, vm, exit_evt).map_err(VmmError::VcpuCreate)?; + let vcpu = Vcpu::new(cpu_idx, vm, kvm, exit_evt).map_err(VmmError::VcpuCreate)?; vcpus.push(vcpu); } Ok(vcpus) @@ -765,7 +778,7 @@ pub fn configure_system_for_boot( #[cfg(target_arch = "x86_64")] let cpu_config = { use crate::cpu_config::x86_64::cpuid; - let cpuid = cpuid::Cpuid::try_from(vmm.vm.supported_cpuid().clone()) + let cpuid = cpuid::Cpuid::try_from(vmm.kvm.supported_cpuid.clone()) .map_err(GuestConfigError::CpuidFromKvmCpuid)?; let msrs = vcpus[0] .kvm_vcpu @@ -1111,7 +1124,8 @@ pub(crate) mod tests { .map_err(StartMicrovmError::Internal) .unwrap(); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); @@ -1137,7 +1151,7 @@ pub(crate) mod tests { #[cfg(target_arch = "aarch64")] { let exit_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); - let _vcpu = Vcpu::new(1, &vm, exit_evt).unwrap(); 
+ let _vcpu = Vcpu::new(1, &vm, &kvm, exit_evt).unwrap(); setup_interrupt_controller(&mut vm, 1).unwrap(); } @@ -1145,6 +1159,7 @@ pub(crate) mod tests { events_observer: Some(std::io::stdin()), instance_info: InstanceInfo::default(), shutdown_exit_code: None, + kvm, vm, guest_memory, uffd: None, @@ -1362,15 +1377,16 @@ pub(crate) mod tests { let vcpu_count = 2; let guest_memory = arch_mem(128 << 20); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); #[allow(unused_mut)] - let mut vm = Vm::new(vec![]).unwrap(); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let evfd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); #[cfg(target_arch = "x86_64")] setup_interrupt_controller(&mut vm).unwrap(); - let vcpu_vec = create_vcpus(&vm, vcpu_count, &evfd).unwrap(); + let vcpu_vec = create_vcpus(&kvm, &vm, vcpu_count, &evfd).unwrap(); assert_eq!(vcpu_vec.len(), vcpu_count as usize); } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 45842d933b2..8526d3c2901 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -244,14 +244,11 @@ impl PortIODeviceManager { #[cfg(test)] mod tests { use super::*; - use crate::test_utils::single_region_mem; - use crate::Vm; + use crate::vstate::vm::tests::setup_vm_with_memory; #[test] fn test_register_legacy_devices() { - let guest_mem = single_region_mem(0x1000); - let mut vm = Vm::new(vec![]).unwrap(); - vm.memory_init(&guest_mem, false).unwrap(); + let (_, mut vm, _) = setup_vm_with_memory(0x1000); crate::builder::setup_interrupt_controller(&mut vm).unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 00c155abcfd..635bc1bc6e0 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -544,6 +544,7 @@ mod tests { use crate::devices::virtio::queue::Queue; use crate::devices::virtio::ActivateError; use crate::test_utils::multi_region_mem; + use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; use crate::{builder, Vm}; @@ -661,7 +662,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -690,7 +692,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -744,7 +747,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, 
false).unwrap(); let mem_clone = guest_mem.clone(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 77c0018c55a..6833a3a12d2 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -127,6 +127,7 @@ use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::terminal::Terminal; +use vstate::kvm::Kvm; use vstate::vcpu::{self, KvmVcpuConfigureError, StartThreadedError, VcpuSendEventError}; use crate::arch::DeviceType; @@ -255,6 +256,8 @@ pub enum VmmError { VcpuSpawn(io::Error), /// Vm error: {0} Vm(vstate::vm::VmError), + /// Kvm error: {0} + Kvm(vstate::kvm::KvmError), /// Error thrown by observer object on Vmm initialization: {0} VmmObserverInit(vmm_sys_util::errno::Error), /// Error thrown by observer object on Vmm teardown: {0} @@ -307,6 +310,7 @@ pub struct Vmm { shutdown_exit_code: Option, // Guest VM core resources. + kvm: Kvm, vm: Vm, guest_memory: GuestMemoryMmap, // Save UFFD in order to keep it open in the Firecracker process, as well. @@ -511,6 +515,7 @@ impl Vmm { pub fn save_state(&mut self, vm_info: &VmInfo) -> Result { use self::MicrovmStateError::SaveVmState; let vcpu_states = self.save_vcpu_states()?; + let kvm_state = self.kvm.save_state(); let vm_state = { #[cfg(target_arch = "x86_64")] { @@ -531,6 +536,7 @@ impl Vmm { Ok(MicrovmState { vm_info: vm_info.clone(), memory_state, + kvm_state, vm_state, vcpu_states, device_states, diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 621d95d1e87..c9aadad10a9 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -36,6 +36,7 @@ use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, Mach use crate::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType, }; +use crate::vstate::kvm::KvmState; use crate::vstate::memory::{ GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryState, MemoryError, }; @@ -77,6 +78,8 @@ pub struct MicrovmState { pub vm_info: VmInfo, /// Memory state. pub memory_state: GuestMemoryState, + /// KVM KVM state. + pub kvm_state: KvmState, /// VM KVM state. pub vm_state: VmState, /// Vcpu states. @@ -736,6 +739,7 @@ mod tests { device_states: states, memory_state, vcpu_states, + kvm_state: Default::default(), vm_info: VmInfo { mem_size_mib: 1u64, ..Default::default() diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs new file mode 100644 index 00000000000..985a9fae1b3 --- /dev/null +++ b/src/vmm/src/vstate/kvm.rs @@ -0,0 +1,205 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::KVM_API_VERSION; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::{CpuId, MsrList, KVM_MAX_CPUID_ENTRIES}; +use kvm_ioctls::Kvm as KvmFd; +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; +use crate::vstate::memory::{GuestMemory, GuestMemoryMmap}; + +/// Errors associated with the wrappers over KVM ioctls. +/// Needs `rustfmt::skip` to make multiline comments work +#[rustfmt::skip] +#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] +pub enum KvmError { + /// The host kernel reports an invalid KVM API version: {0} + ApiVersion(i32), + /// Missing KVM capabilities: {0:#x?} + Capabilities(u32), + /** Error creating KVM object: {0} Make sure the user launching the firecracker process is \ + configured on the /dev/kvm file's ACL. 
*/ + Kvm(kvm_ioctls::Error), + #[cfg(target_arch = "x86_64")] + /// Failed to get MSR index list to save into snapshots: {0} + GetMsrsToSave(crate::arch::x86_64::msr::MsrError), + #[cfg(target_arch = "x86_64")] + /// Failed to get supported cpuid: {0} + GetSupportedCpuId(kvm_ioctls::Error), + /// The number of configured slots is bigger than the maximum reported by KVM + NotEnoughMemorySlots, +} + +/// Struct with kvm fd and kvm associated paramenters. +#[derive(Debug)] +pub struct Kvm { + /// KVM fd. + pub fd: KvmFd, + /// Maximum number of memory slots allowed by KVM. + pub max_memslots: usize, + /// Additional capabilities that were specified in cpu template. + pub kvm_cap_modifiers: Vec, + + #[cfg(target_arch = "x86_64")] + /// Supported CpuIds. + pub supported_cpuid: CpuId, + #[cfg(target_arch = "x86_64")] + /// Msrs needed to be saved on snapshot creation. + pub msrs_to_save: MsrList, +} + +impl Kvm { + /// Create `Kvm` struct. + pub fn new(kvm_cap_modifiers: Vec) -> Result { + let kvm_fd = KvmFd::new().map_err(KvmError::Kvm)?; + + // Check that KVM has the correct version. + // Safe to cast because this is a constant. + #[allow(clippy::cast_possible_wrap)] + if kvm_fd.get_api_version() != KVM_API_VERSION as i32 { + return Err(KvmError::ApiVersion(kvm_fd.get_api_version())); + } + + let total_caps = Self::combine_capabilities(&kvm_cap_modifiers); + // Check that all desired capabilities are supported. + Self::check_capabilities(&kvm_fd, &total_caps).map_err(KvmError::Capabilities)?; + + let max_memslots = kvm_fd.get_nr_memslots(); + + #[cfg(target_arch = "aarch64")] + { + Ok(Self { + fd: kvm_fd, + max_memslots, + kvm_cap_modifiers, + }) + } + + #[cfg(target_arch = "x86_64")] + { + let supported_cpuid = kvm_fd + .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) + .map_err(KvmError::GetSupportedCpuId)?; + let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm_fd) + .map_err(KvmError::GetMsrsToSave)?; + + Ok(Kvm { + fd: kvm_fd, + max_memslots, + kvm_cap_modifiers, + supported_cpuid, + msrs_to_save, + }) + } + } + + /// Check guest memory does not have more regions than kvm allows. + pub fn check_memory(&self, guest_mem: &GuestMemoryMmap) -> Result<(), KvmError> { + if guest_mem.num_regions() > self.max_memslots { + Err(KvmError::NotEnoughMemorySlots) + } else { + Ok(()) + } + } + + fn combine_capabilities(kvm_cap_modifiers: &[KvmCapability]) -> Vec { + let mut total_caps = Self::DEFAULT_CAPABILITIES.to_vec(); + for modifier in kvm_cap_modifiers.iter() { + match modifier { + KvmCapability::Add(cap) => { + if !total_caps.contains(cap) { + total_caps.push(*cap); + } + } + KvmCapability::Remove(cap) => { + if let Some(pos) = total_caps.iter().position(|c| c == cap) { + total_caps.swap_remove(pos); + } + } + } + } + total_caps + } + + fn check_capabilities(kvm_fd: &KvmFd, capabilities: &[u32]) -> Result<(), u32> { + for cap in capabilities { + // If capability is not supported kernel will return 0. + if kvm_fd.check_extension_raw(u64::from(*cap)) == 0 { + return Err(*cap); + } + } + Ok(()) + } + + /// Saves and returns the Kvm state. 
+ pub fn save_state(&self) -> KvmState { + KvmState { + kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), + } + } +} + +#[cfg(target_arch = "aarch64")] +impl Kvm { + const DEFAULT_CAPABILITIES: [u32; 7] = [ + kvm_bindings::KVM_CAP_IOEVENTFD, + kvm_bindings::KVM_CAP_IRQFD, + kvm_bindings::KVM_CAP_USER_MEMORY, + kvm_bindings::KVM_CAP_ARM_PSCI_0_2, + kvm_bindings::KVM_CAP_DEVICE_CTRL, + kvm_bindings::KVM_CAP_MP_STATE, + kvm_bindings::KVM_CAP_ONE_REG, + ]; +} + +#[cfg(target_arch = "x86_64")] +impl Kvm { + const DEFAULT_CAPABILITIES: [u32; 14] = [ + kvm_bindings::KVM_CAP_IRQCHIP, + kvm_bindings::KVM_CAP_IOEVENTFD, + kvm_bindings::KVM_CAP_IRQFD, + kvm_bindings::KVM_CAP_USER_MEMORY, + kvm_bindings::KVM_CAP_SET_TSS_ADDR, + kvm_bindings::KVM_CAP_PIT2, + kvm_bindings::KVM_CAP_PIT_STATE2, + kvm_bindings::KVM_CAP_ADJUST_CLOCK, + kvm_bindings::KVM_CAP_DEBUGREGS, + kvm_bindings::KVM_CAP_MP_STATE, + kvm_bindings::KVM_CAP_VCPU_EVENTS, + kvm_bindings::KVM_CAP_XCRS, + kvm_bindings::KVM_CAP_XSAVE, + kvm_bindings::KVM_CAP_EXT_CPUID, + ]; +} + +/// Structure holding an general specific VM state. +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct KvmState { + /// Additional capabilities that were specified in cpu template. + pub kvm_cap_modifiers: Vec, +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + + #[test] + fn test_combine_capabilities() { + // Default caps for x86_64 and aarch64 both have KVM_CAP_IOEVENTFD and don't have + // KVM_CAP_IOMMU caps. + let additional_capabilities = vec![ + KvmCapability::Add(kvm_bindings::KVM_CAP_IOMMU), + KvmCapability::Remove(kvm_bindings::KVM_CAP_IOEVENTFD), + ]; + + let combined_caps = Kvm::combine_capabilities(&additional_capabilities); + assert!(combined_caps + .iter() + .any(|c| *c == kvm_bindings::KVM_CAP_IOMMU)); + assert!(!combined_caps + .iter() + .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD)); + } +} diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 32d7bd7ea7f..47458835e04 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -1,6 +1,8 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +/// Module with Kvm implementation. +pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; /// Module with Vcpu implementation. diff --git a/src/vmm/src/vstate/vcpu/aarch64.rs b/src/vmm/src/vstate/vcpu/aarch64.rs index 4e006f196d0..4097ef59044 100644 --- a/src/vmm/src/vstate/vcpu/aarch64.rs +++ b/src/vmm/src/vstate/vcpu/aarch64.rs @@ -22,6 +22,7 @@ use crate::cpu_config::aarch64::custom_cpu_template::VcpuFeatures; use crate::cpu_config::templates::CpuConfiguration; use crate::logger::{error, IncMetric, METRICS}; use crate::vcpu::{VcpuConfig, VcpuError}; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::VcpuEmulation; use crate::vstate::vm::Vm; @@ -77,7 +78,7 @@ impl KvmVcpu { /// /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. 
- pub fn new(index: u8, vm: &Vm) -> Result { + pub fn new(index: u8, vm: &Vm, _: &Kvm) -> Result { let kvm_vcpu = vm .fd() .create_vcpu(index.into()) @@ -305,26 +306,27 @@ mod tests { use crate::cpu_config::aarch64::CpuConfiguration; use crate::cpu_config::templates::RegisterValueFilter; use crate::vcpu::VcpuConfig; + use crate::vstate::kvm::Kvm; use crate::vstate::memory::GuestMemoryMmap; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::setup_vm_with_memory; use crate::vstate::vm::Vm; - fn setup_vcpu(mem_size: usize) -> (Vm, KvmVcpu, GuestMemoryMmap) { - let (mut vm, vm_mem) = setup_vm(mem_size); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { + let (kvm, mut vm, vm_mem) = setup_vm_with_memory(mem_size); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vcpu.init(&[]).unwrap(); vm.setup_irqchip(1).unwrap(); - (vm, vcpu, vm_mem) + (kvm, vm, vcpu, vm_mem) } #[test] fn test_create_vcpu() { - let (vm, _) = setup_vm(0x1000); + let (kvm, vm, _) = setup_vm_with_memory(0x1000); unsafe { libc::close(vm.fd().as_raw_fd()) }; - let err = KvmVcpu::new(0, &vm); + let err = KvmVcpu::new(0, &vm, &kvm); assert_eq!( err.err().unwrap().to_string(), "Error creating vcpu: Bad file descriptor (os error 9)".to_string() @@ -336,7 +338,7 @@ mod tests { #[test] fn test_configure_vcpu() { - let (_vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (_, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, @@ -371,8 +373,8 @@ mod tests { #[test] fn test_init_vcpu() { - let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); // KVM_ARM_VCPU_PSCI_0_2 is set by default. @@ -390,8 +392,8 @@ mod tests { #[test] fn test_vcpu_save_restore_state() { - let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); // Calling KVM_GET_REGLIST before KVM_VCPU_INIT will result in error. @@ -434,8 +436,8 @@ mod tests { // // This should fail with ENOEXEC. // https://elixir.bootlin.com/linux/v5.10.176/source/arch/arm64/kvm/arm.c#L1165 - let (mut vm, _vm_mem) = setup_vm(0x1000); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu.dump_cpu_config().unwrap_err(); @@ -444,8 +446,8 @@ mod tests { #[test] fn test_dump_cpu_config_after_init() { // Test `dump_cpu_config()` after `KVM_VCPU_INIT`. - let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu.init(&[]).unwrap(); @@ -454,10 +456,10 @@ mod tests { #[test] fn test_setup_non_boot_vcpu() { - let (vm, _) = setup_vm(0x1000); - let mut vcpu1 = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu1 = KvmVcpu::new(0, &vm, &kvm).unwrap(); vcpu1.init(&[]).unwrap(); - let mut vcpu2 = KvmVcpu::new(1, &vm).unwrap(); + let mut vcpu2 = KvmVcpu::new(1, &vm, &kvm).unwrap(); vcpu2.init(&[]).unwrap(); } @@ -466,7 +468,7 @@ mod tests { // Test `get_regs()` with valid register IDs. 
// - X0: 0x6030 0000 0010 0000 // - X1: 0x6030 0000 0010 0002 - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); let reg_list = Vec::::from([0x6030000000100000, 0x6030000000100002]); get_registers(&vcpu.fd, ®_list, &mut Aarch64RegisterVec::default()).unwrap(); } @@ -474,7 +476,7 @@ mod tests { #[test] fn test_get_invalid_regs() { // Test `get_regs()` with invalid register IDs. - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); let reg_list = Vec::::from([0x6030000000100001, 0x6030000000100003]); get_registers(&vcpu.fd, ®_list, &mut Aarch64RegisterVec::default()).unwrap_err(); } diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index ddfeda21b4c..3d8877285a4 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -44,6 +44,8 @@ pub use aarch64::{KvmVcpuError, *}; #[cfg(target_arch = "x86_64")] pub use x86_64::{KvmVcpuError, *}; +use super::kvm::Kvm; + /// Signal number (SIGRTMIN) used to kick Vcpus. pub const VCPU_RTSIG_OFFSET: i32 = 0; @@ -212,10 +214,10 @@ impl Vcpu { /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. /// * `exit_evt` - An `EventFd` that will be written into when this vcpu exits. - pub fn new(index: u8, vm: &Vm, exit_evt: EventFd) -> Result { + pub fn new(index: u8, vm: &Vm, kvm: &Kvm, exit_evt: EventFd) -> Result { let (event_sender, event_receiver) = channel(); let (response_sender, response_receiver) = channel(); - let kvm_vcpu = KvmVcpu::new(index, vm).unwrap(); + let kvm_vcpu = KvmVcpu::new(index, vm, kvm).unwrap(); Ok(Vcpu { exit_evt, @@ -777,13 +779,13 @@ pub(crate) mod tests { use crate::utils::signal::validate_signal_num; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::VcpuError as EmulationError; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::setup_vm_with_memory; use crate::vstate::vm::Vm; use crate::RECV_TIMEOUT_SEC; #[test] fn test_handle_kvm_exit() { - let (_vm, mut vcpu, _vm_mem) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _vm_mem) = setup_vcpu(0x1000); let res = handle_kvm_exit(&mut vcpu.kvm_vcpu.peripherals, Ok(VcpuExit::Hlt)); assert_eq!(res.unwrap(), VcpuEmulation::Stopped); @@ -918,14 +920,14 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. #[allow(unused_mut)] - pub(crate) fn setup_vcpu(mem_size: usize) -> (Vm, Vcpu, GuestMemoryMmap) { - let (mut vm, gm) = setup_vm(mem_size); + pub(crate) fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, Vcpu, GuestMemoryMmap) { + let (kvm, mut vm, gm) = setup_vm_with_memory(mem_size); let exit_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); #[cfg(target_arch = "aarch64")] let vcpu = { - let mut vcpu = Vcpu::new(1, &vm, exit_evt).unwrap(); + let mut vcpu = Vcpu::new(1, &vm, &kvm, exit_evt).unwrap(); vcpu.kvm_vcpu.init(&[]).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu @@ -933,9 +935,9 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] let vcpu = { vm.setup_irqchip().unwrap(); - Vcpu::new(1, &vm, exit_evt).unwrap() + Vcpu::new(1, &vm, &kvm, exit_evt).unwrap() }; - (vm, vcpu, gm) + (kvm, vm, vcpu, gm) } fn load_good_kernel(vm_memory: &GuestMemoryMmap) -> GuestAddress { @@ -970,7 +972,7 @@ pub(crate) mod tests { Vcpu::register_kick_signal_handler(); // Need enough mem to boot linux. 
let mem_size = 64 << 20; - let (_vm, mut vcpu, vm_mem) = setup_vcpu(mem_size); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(mem_size); let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap(); @@ -988,7 +990,7 @@ pub(crate) mod tests { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(_vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }, @@ -1022,7 +1024,7 @@ pub(crate) mod tests { #[test] fn test_set_mmio_bus() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); vcpu.set_mmio_bus(crate::devices::Bus::new()); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); @@ -1030,7 +1032,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_tls() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); // Running on the TLS vcpu should fail before we actually initialize it. unsafe { @@ -1061,7 +1063,7 @@ pub(crate) mod tests { #[test] fn test_invalid_tls() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); // Initialize vcpu TLS. vcpu.init_thread_local_data().unwrap(); // Trying to initialize non-empty TLS should error. @@ -1071,7 +1073,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_kick() { Vcpu::register_kick_signal_handler(); - let (vm, mut vcpu, _) = setup_vcpu(0x1000); + let (_, vm, mut vcpu, _) = setup_vcpu(0x1000); let mut kvm_run = kvm_ioctls::KvmRunWrapper::mmap_from_fd(&vcpu.kvm_vcpu.fd, vm.fd().run_size()) @@ -1126,7 +1128,7 @@ pub(crate) mod tests { #[test] fn test_immediate_exit_shortcircuits_execution() { - let (_vm, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); vcpu.kvm_vcpu.fd.set_kvm_immediate_exit(1); // Set a dummy value to be returned by the emulate call diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs index 4043691130d..39ff0879ee8 100644 --- a/src/vmm/src/vstate/vcpu/x86_64.rs +++ b/src/vmm/src/vstate/vcpu/x86_64.rs @@ -23,6 +23,7 @@ use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError}; use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError}; use crate::cpu_config::x86_64::{cpuid, CpuConfiguration}; use crate::logger::{IncMetric, METRICS}; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation}; use crate::vstate::vm::Vm; @@ -164,7 +165,7 @@ impl KvmVcpu { /// /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. 
- pub fn new(index: u8, vm: &Vm) -> Result { + pub fn new(index: u8, vm: &Vm, kvm: &Kvm) -> Result { let kvm_vcpu = vm .fd() .create_vcpu(index.into()) @@ -174,7 +175,7 @@ impl KvmVcpu { index, fd: kvm_vcpu, peripherals: Default::default(), - msrs_to_save: vm.msrs_to_save().as_slice().to_vec(), + msrs_to_save: kvm.msrs_to_save.as_slice().to_vec(), }) } @@ -716,7 +717,7 @@ mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use kvm_bindings::kvm_msr_entry; - use kvm_ioctls::{Cap, Kvm}; + use kvm_ioctls::Cap; use super::*; use crate::arch::x86_64::cpu_model::CpuModel; @@ -725,7 +726,7 @@ mod tests { StaticCpuTemplate, }; use crate::cpu_config::x86_64::cpuid::{Cpuid, CpuidEntry, CpuidKey}; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory}; use crate::vstate::vm::Vm; impl Default for VcpuState { @@ -746,11 +747,11 @@ mod tests { } } - fn setup_vcpu(mem_size: usize) -> (Vm, KvmVcpu, GuestMemoryMmap) { - let (vm, vm_mem) = setup_vm(mem_size); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { + let (kvm, vm, vm_mem) = setup_vm_with_memory(mem_size); vm.setup_irqchip().unwrap(); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); - (vm, vcpu, vm_mem) + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); + (kvm, vm, vcpu, vm_mem) } fn is_at_least_cascade_lake() -> bool { @@ -765,11 +766,11 @@ mod tests { } fn create_vcpu_config( - vm: &Vm, + kvm: &Kvm, vcpu: &KvmVcpu, template: &CustomCpuTemplate, ) -> Result { - let cpuid = Cpuid::try_from(vm.supported_cpuid().clone()) + let cpuid = Cpuid::try_from(kvm.supported_cpuid.clone()) .map_err(GuestConfigError::CpuidFromKvmCpuid)?; let msrs = vcpu .get_msrs(template.msr_index_iter()) @@ -785,19 +786,19 @@ mod tests { #[test] fn test_configure_vcpu() { - let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); - let vcpu_config = create_vcpu_config(&vm, &vcpu, &CustomCpuTemplate::default()).unwrap(); + let vcpu_config = create_vcpu_config(&kvm, &vcpu, &CustomCpuTemplate::default()).unwrap(); assert_eq!( vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config,), Ok(()) ); - let try_configure = |vm: &Vm, vcpu: &mut KvmVcpu, template| -> bool { + let try_configure = |kvm: &Kvm, vcpu: &mut KvmVcpu, template| -> bool { let cpu_template = Some(CpuTemplateType::Static(template)); let template = cpu_template.get_cpu_template(); match template { - Ok(template) => match create_vcpu_config(vm, vcpu, &template) { + Ok(template) => match create_vcpu_config(kvm, vcpu, &template) { Ok(config) => vcpu .configure( &vm_mem, @@ -812,19 +813,19 @@ mod tests { }; // Test configure while using the T2 template. - let t2_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2); + let t2_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2); // Test configure while using the C3 template. - let c3_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::C3); + let c3_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::C3); // Test configure while using the T2S template. - let t2s_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2S); + let t2s_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2S); // Test configure while using the T2CL template. - let t2cl_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2CL); + let t2cl_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2CL); // Test configure while using the T2S template. 
- let t2a_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2A); + let t2a_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2A); match &cpuid::common::get_vendor_id_from_host().unwrap() { cpuid::VENDOR_ID_INTEL => { @@ -857,8 +858,8 @@ mod tests { #[test] fn test_vcpu_cpuid_restore() { - let (vm, vcpu, _mem) = setup_vcpu(0x10000); - vcpu.fd.set_cpuid2(vm.supported_cpuid()).unwrap(); + let (kvm, _, vcpu, _mem) = setup_vcpu(0x10000); + vcpu.fd.set_cpuid2(&kvm.supported_cpuid).unwrap(); // Mutate the CPUID. // Leaf 0x3 / EAX that is an unused (reserved to be accurate) register, so it's harmless. @@ -875,7 +876,7 @@ mod tests { drop(vcpu); // Restore the state into a new vcpu. - let (_vm, vcpu, _mem) = setup_vcpu(0x10000); + let (_, _vm, vcpu, _mem) = setup_vcpu(0x10000); let result2 = vcpu.restore_state(&state); assert!(result2.is_ok(), "{}", result2.unwrap_err()); @@ -895,12 +896,12 @@ mod tests { #[test] fn test_empty_cpuid_entries_removed() { // Test that `get_cpuid()` removes zeroed empty entries from the `KVM_GET_CPUID2` result. - let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }; @@ -946,7 +947,7 @@ mod tests { // Since `KVM_SET_CPUID2` has not been called before vcpu configuration, all leaves should // be filled with zero. Therefore, `KvmVcpu::dump_cpu_config()` should fail with CPUID type // conversion error due to the lack of brand string info in leaf 0x0. - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); match vcpu.dump_cpu_config() { Err(KvmVcpuError::ConvertCpuidType(_)) => (), Err(err) => panic!("Unexpected error: {err}"), @@ -957,12 +958,12 @@ mod tests { #[test] fn test_dump_cpu_config_with_configured_vcpu() { // Test `dump_cpu_config()` after vcpu configuration. - let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }; @@ -976,7 +977,7 @@ mod tests { fn test_is_tsc_scaling_required() { // Test `is_tsc_scaling_required` as if it were on the same // CPU model as the one in the snapshot state. - let (_vm, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); { // The frequency difference is within tolerance. @@ -1015,7 +1016,7 @@ mod tests { #[test] fn test_set_tsc() { - let (vm, vcpu, _) = setup_vcpu(0x1000); + let (kvm, _, vcpu, _) = setup_vcpu(0x1000); let mut state = vcpu.save_state().unwrap(); state.tsc_khz = Some( state.tsc_khz.unwrap() @@ -1024,9 +1025,9 @@ mod tests { * 2, ); - if vm.fd().check_extension(Cap::TscControl) { + if kvm.fd.check_extension(Cap::TscControl) { vcpu.set_tsc_khz(state.tsc_khz.unwrap()).unwrap(); - if vm.fd().check_extension(Cap::GetTscKhz) { + if kvm.fd.check_extension(Cap::GetTscKhz) { assert_eq!(vcpu.get_tsc_khz().ok(), state.tsc_khz); } else { vcpu.get_tsc_khz().unwrap_err(); @@ -1040,7 +1041,7 @@ mod tests { fn test_get_msrs_with_msrs_to_save() { // Test `get_msrs()` with the MSR indices that should be serialized into snapshots. 
// The MSR indices should be valid and this test should succeed. - let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); vcpu.get_msrs(vcpu.msrs_to_save.iter().copied()).unwrap(); } @@ -1048,7 +1049,7 @@ mod tests { fn test_get_msrs_with_msrs_to_dump() { // Test `get_msrs()` with the MSR indices that should be dumped. // All the MSR indices should be valid and the call should succeed. - let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); let kvm = kvm_ioctls::Kvm::new().unwrap(); let msrs_to_dump = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm).unwrap(); @@ -1061,7 +1062,7 @@ mod tests { // Test `get_msrs()` with unsupported MSR indices. This should return `VcpuGetMsr` error // that happens when `KVM_GET_MSRS` fails to populate MSR values in the middle and exits. // Currently, MSR indices 2..=4 are not listed as supported MSRs. - let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); let msr_index_list: Vec = vec![2, 3, 4]; match vcpu.get_msrs(msr_index_list.iter().copied()) { Err(KvmVcpuError::VcpuGetMsr(_)) => (), @@ -1167,13 +1168,11 @@ mod tests { #[test] fn test_get_msr_chunks_preserved_order() { // Regression test for #4666 - - let kvm = Kvm::new().unwrap(); - let vm = Vm::new(Vec::new()).unwrap(); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, vm) = setup_vm(); + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); // The list of supported MSR indices, in the order they were returned by KVM - let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm).unwrap(); + let msrs_to_save = kvm.msrs_to_save; // The MSRs after processing. The order should be identical to the one returned by KVM, with // the exception of deferred MSRs, which should be moved to the end (but show up in the same // order as they are listed in [`DEFERRED_MSRS`]. diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index d213d4d7bb6..1bcf191b8b9 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -10,21 +10,21 @@ use std::fmt; #[cfg(target_arch = "x86_64")] use kvm_bindings::{ - kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, CpuId, MsrList, - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, + kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, KVM_CLOCK_TSC_STABLE, + KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, }; -use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; -use kvm_ioctls::{Kvm, VmFd}; +use kvm_bindings::{kvm_userspace_memory_region, KVM_MEM_LOG_DIRTY_PAGES}; +// use kvm_ioctls::{Kvm, VmFd}; +use kvm_ioctls::VmFd; use serde::{Deserialize, Serialize}; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GICDevice; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GicState; -use crate::cpu_config::templates::KvmCapability; #[cfg(target_arch = "x86_64")] use crate::utils::u64_to_usize; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; /// Errors associated with the wrappers over KVM ioctls. 
@@ -32,18 +32,6 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRe #[rustfmt::skip] #[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] pub enum VmError { - /// The host kernel reports an invalid KVM API version: {0} - ApiVersion(i32), - /// Missing KVM capabilities: {0:#x?} - Capabilities(u32), - /** Error creating KVM object: {0} Make sure the user launching the firecracker process is \ - configured on the /dev/kvm file's ACL. */ - Kvm(kvm_ioctls::Error), - #[cfg(target_arch = "x86_64")] - /// Failed to get MSR index list to save into snapshots: {0} - GetMsrsToSave(#[from] crate::arch::x86_64::msr::MsrError), - /// The number of configured slots is bigger than the maximum reported by KVM - NotEnoughMemorySlots, /// Cannot set the memory regions: {0} SetUserMemoryRegion(kvm_ioctls::Error), #[cfg(target_arch = "aarch64")] @@ -112,16 +100,6 @@ pub enum RestoreStateError { #[derive(Debug)] pub struct Vm { fd: VmFd, - max_memslots: usize, - - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, - - // X86 specific fields. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - supported_cpuid: CpuId, - #[cfg(target_arch = "x86_64")] - msrs_to_save: MsrList, // Arm specific fields. // On aarch64 we need to keep around the fd obtained by creating the VGIC device. @@ -131,79 +109,23 @@ pub struct Vm { /// Contains Vm functions that are usable across CPU architectures impl Vm { - /// Constructs a new `Vm` using the given `Kvm` instance. - pub fn new(kvm_cap_modifiers: Vec) -> Result { - let kvm = Kvm::new().map_err(VmError::Kvm)?; - - // Check that KVM has the correct version. - // Safe to cast because this is a constant. - #[allow(clippy::cast_possible_wrap)] - if kvm.get_api_version() != KVM_API_VERSION as i32 { - return Err(VmError::ApiVersion(kvm.get_api_version())); - } - - let total_caps = Self::combine_capabilities(&kvm_cap_modifiers); - // Check that all desired capabilities are supported. - Self::check_capabilities(&kvm, &total_caps).map_err(VmError::Capabilities)?; - - let max_memslots = kvm.get_nr_memslots(); + /// Create a new `Vm` struct. + pub fn new(kvm: &Kvm) -> Result { // Create fd for interacting with kvm-vm specific functions. - let vm_fd = kvm.create_vm().map_err(VmError::VmFd)?; + let vm_fd = kvm.fd.create_vm().map_err(VmError::VmFd)?; #[cfg(target_arch = "aarch64")] { Ok(Vm { fd: vm_fd, - max_memslots, - kvm_cap_modifiers, irqchip_handle: None, }) } #[cfg(target_arch = "x86_64")] { - let supported_cpuid = kvm - .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) - .map_err(VmError::VmFd)?; - let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm)?; - - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - supported_cpuid, - msrs_to_save, - }) - } - } - - fn combine_capabilities(kvm_cap_modifiers: &[KvmCapability]) -> Vec { - let mut total_caps = Self::DEFAULT_CAPABILITIES.to_vec(); - for modifier in kvm_cap_modifiers.iter() { - match modifier { - KvmCapability::Add(cap) => { - if !total_caps.iter().any(|c| c == cap) { - total_caps.push(*cap); - } - } - KvmCapability::Remove(cap) => { - if let Some(pos) = total_caps.iter().position(|c| c == cap) { - total_caps.remove(pos); - } - } - } + Ok(Vm { fd: vm_fd }) } - total_caps - } - - fn check_capabilities(kvm: &Kvm, capabilities: &[u32]) -> Result<(), u32> { - for cap in capabilities { - // If capability is not supported kernel will return 0. 
- if kvm.check_extension_raw(u64::from(*cap)) == 0 { - return Err(*cap); - } - } - Ok(()) } /// Initializes the guest memory. @@ -212,9 +134,6 @@ impl Vm { guest_mem: &GuestMemoryMmap, track_dirty_pages: bool, ) -> Result<(), VmError> { - if guest_mem.num_regions() > self.max_memslots { - return Err(VmError::NotEnoughMemorySlots); - } self.set_kvm_memory_regions(guest_mem, track_dirty_pages)?; #[cfg(target_arch = "x86_64")] self.fd @@ -261,16 +180,6 @@ impl Vm { #[cfg(target_arch = "aarch64")] impl Vm { - const DEFAULT_CAPABILITIES: [u32; 7] = [ - kvm_bindings::KVM_CAP_IOEVENTFD, - kvm_bindings::KVM_CAP_IRQFD, - kvm_bindings::KVM_CAP_USER_MEMORY, - kvm_bindings::KVM_CAP_ARM_PSCI_0_2, - kvm_bindings::KVM_CAP_DEVICE_CTRL, - kvm_bindings::KVM_CAP_MP_STATE, - kvm_bindings::KVM_CAP_ONE_REG, - ]; - /// Creates the GIC (Global Interrupt Controller). pub fn setup_irqchip(&mut self, vcpu_count: u8) -> Result<(), VmError> { self.irqchip_handle = Some( @@ -292,7 +201,6 @@ impl Vm { .get_irqchip() .save_device(mpidrs) .map_err(VmError::SaveGic)?, - kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), }) } @@ -319,39 +227,10 @@ impl Vm { pub struct VmState { /// GIC state. pub gic: GicState, - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, } #[cfg(target_arch = "x86_64")] impl Vm { - const DEFAULT_CAPABILITIES: [u32; 14] = [ - kvm_bindings::KVM_CAP_IRQCHIP, - kvm_bindings::KVM_CAP_IOEVENTFD, - kvm_bindings::KVM_CAP_IRQFD, - kvm_bindings::KVM_CAP_USER_MEMORY, - kvm_bindings::KVM_CAP_SET_TSS_ADDR, - kvm_bindings::KVM_CAP_PIT2, - kvm_bindings::KVM_CAP_PIT_STATE2, - kvm_bindings::KVM_CAP_ADJUST_CLOCK, - kvm_bindings::KVM_CAP_DEBUGREGS, - kvm_bindings::KVM_CAP_MP_STATE, - kvm_bindings::KVM_CAP_VCPU_EVENTS, - kvm_bindings::KVM_CAP_XCRS, - kvm_bindings::KVM_CAP_XSAVE, - kvm_bindings::KVM_CAP_EXT_CPUID, - ]; - - /// Returns a ref to the supported `CpuId` for this Vm. - pub fn supported_cpuid(&self) -> &CpuId { - &self.supported_cpuid - } - - /// Returns a ref to the list of serializable MSR indices. - pub fn msrs_to_save(&self) -> &MsrList { - &self.msrs_to_save - } - /// Restores the KVM VM state. /// /// # Errors @@ -431,7 +310,6 @@ impl Vm { pic_master, pic_slave, ioapic, - kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), }) } } @@ -447,9 +325,6 @@ pub struct VmState { // TODO: rename this field to adopt inclusive language once Linux updates it, too. pic_slave: kvm_irqchip, ioapic: kvm_irqchip, - - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, } #[cfg(target_arch = "x86_64")] @@ -474,43 +349,30 @@ pub(crate) mod tests { use crate::vstate::memory::GuestMemoryMmap; // Auxiliary function being used throughout the tests. - pub(crate) fn setup_vm(mem_size: usize) -> (Vm, GuestMemoryMmap) { - let gm = single_region_mem(mem_size); + pub(crate) fn setup_vm() -> (Kvm, Vm) { + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let vm = Vm::new(&kvm).expect("Cannot create new vm"); + (kvm, vm) + } - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + // Auxiliary function being used throughout the tests. + pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm, GuestMemoryMmap) { + let (kvm, vm) = setup_vm(); + let gm = single_region_mem(mem_size); vm.memory_init(&gm, false).unwrap(); - - (vm, gm) + (kvm, vm, gm) } #[test] fn test_new() { // Testing with a valid /dev/kvm descriptor. 
- Vm::new(vec![]).unwrap(); - } - - #[test] - fn test_combine_capabilities() { - // Default caps for x86_64 and aarch64 both have KVM_CAP_IOEVENTFD and don't have - // KVM_CAP_IOMMU caps. - let additional_capabilities = vec![ - KvmCapability::Add(kvm_bindings::KVM_CAP_IOMMU), - KvmCapability::Remove(kvm_bindings::KVM_CAP_IOEVENTFD), - ]; - - let combined_caps = Vm::combine_capabilities(&additional_capabilities); - assert!(combined_caps - .iter() - .any(|c| *c == kvm_bindings::KVM_CAP_IOMMU)); - assert!(!combined_caps - .iter() - .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD)); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + Vm::new(&kvm).unwrap(); } #[test] fn test_vm_memory_init() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); - + let (_, vm) = setup_vm(); // Create valid memory region and test that the initialization is successful. let gm = single_region_mem(0x1000); vm.memory_init(&gm, true).unwrap(); @@ -519,11 +381,11 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_vm_save_restore_state() { - let vm = Vm::new(vec![]).expect("new vm failed"); + let (_, vm) = setup_vm(); // Irqchips, clock and pitstate are not configured so trying to save state should fail. vm.save_state().unwrap_err(); - let (vm, _mem) = setup_vm(0x1000); + let (_, vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let vm_state = vm.save_state().unwrap(); @@ -536,7 +398,7 @@ pub(crate) mod tests { assert_eq!(vm_state.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE); assert_eq!(vm_state.ioapic.chip_id, KVM_IRQCHIP_IOAPIC); - let (mut vm, _mem) = setup_vm(0x1000); + let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); vm.restore_state(&vm_state).unwrap(); @@ -547,11 +409,11 @@ pub(crate) mod tests { fn test_vm_save_restore_state_bad_irqchip() { use kvm_bindings::KVM_NR_IRQCHIPS; - let (vm, _mem) = setup_vm(0x1000); + let (_, vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut vm_state = vm.save_state().unwrap(); - let (mut vm, _mem) = setup_vm(0x1000); + let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); // Try to restore an invalid PIC Master chip ID @@ -576,7 +438,7 @@ pub(crate) mod tests { fn test_vmstate_serde() { let mut snapshot_data = vec![0u8; 10000]; - let (mut vm, _) = setup_vm(0x1000); + let (_, mut vm, _) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let state = vm.save_state().unwrap(); Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); @@ -587,7 +449,7 @@ pub(crate) mod tests { #[test] fn test_set_kvm_memory_regions() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (_, vm) = setup_vm(); let gm = single_region_mem(0x1000); let res = vm.set_kvm_memory_regions(&gm, false); From 87f7830da6dd9dfcc5ce2192161ef0e09fa02fa0 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Fri, 10 Jan 2025 11:27:02 +0000 Subject: [PATCH 2/5] feat: reset KVM_REG_ARM_PTIMER_CNT on VM boot Reset KVM_REG_ARM_PTIMER_CNT physical counter register on VM boot to avoid passing through host physical counter. Note that resetting the register on VM boot does not guarantee that VM will see the counter value 0 at startup because there is a delta in time between register reset and VM boot during which counter continues to advance. In order to check if the kernel supports the counter reset we query KVM_CAP_COUNTER_OFFSET capability and only reset the KVM_REG_ARM_PTIMER_CNT if it is present. 
Signed-off-by: Egor Lazarchuk --- src/vmm/src/arch/aarch64/regs.rs | 6 +++++ src/vmm/src/arch/aarch64/vcpu.rs | 42 ++++++++++++++++++++++++++++-- src/vmm/src/builder.rs | 29 +++++++++++++++------ src/vmm/src/vstate/kvm.rs | 18 ++++++++++++- src/vmm/src/vstate/vcpu/aarch64.rs | 9 +++++-- src/vmm/src/vstate/vcpu/mod.rs | 1 + 6 files changed, 92 insertions(+), 13 deletions(-) diff --git a/src/vmm/src/arch/aarch64/regs.rs b/src/vmm/src/arch/aarch64/regs.rs index 5238f58ba70..d844fbfb56b 100644 --- a/src/vmm/src/arch/aarch64/regs.rs +++ b/src/vmm/src/arch/aarch64/regs.rs @@ -99,6 +99,12 @@ arm64_sys_reg!(SYS_CNTV_CVAL_EL0, 3, 3, 14, 3, 2); // https://elixir.bootlin.com/linux/v6.8/source/arch/arm64/include/asm/sysreg.h#L459 arm64_sys_reg!(SYS_CNTPCT_EL0, 3, 3, 14, 0, 1); +// Physical Timer EL0 count Register +// The id of this register is same as SYS_CNTPCT_EL0, but KVM defines it +// separately, so we do as well. +// https://elixir.bootlin.com/linux/v6.12.6/source/arch/arm64/include/uapi/asm/kvm.h#L259 +arm64_sys_reg!(KVM_REG_ARM_PTIMER_CNT, 3, 3, 14, 0, 1); + // Translation Table Base Register // https://developer.arm.com/documentation/ddi0595/2021-03/AArch64-Registers/TTBR1-EL1--Translation-Table-Base-Register-1--EL1- arm64_sys_reg!(TTBR1_EL1, 3, 0, 2, 0, 1); diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 859e2da2cb6..7b34ae91896 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -13,6 +13,7 @@ use kvm_ioctls::VcpuFd; use super::get_fdt_addr; use super::regs::*; +use crate::vstate::kvm::OptionalCapabilities; use crate::vstate::memory::GuestMemoryMmap; /// Errors thrown while setting aarch64 registers. @@ -78,6 +79,7 @@ pub fn setup_boot_regs( cpu_id: u8, boot_ip: u64, mem: &GuestMemoryMmap, + optional_capabilities: &OptionalCapabilities, ) -> Result<(), VcpuError> { let kreg_off = offset_of!(kvm_regs, regs); @@ -106,6 +108,23 @@ pub fn setup_boot_regs( vcpufd .set_one_reg(id, &get_fdt_addr(mem).to_le_bytes()) .map_err(|err| VcpuError::SetOneReg(id, err))?; + + // Reset the physical counter for the guest. This way we avoid guest reading + // host physical counter. + // Resetting KVM_REG_ARM_PTIMER_CNT for single vcpu is enough because there is only + // one timer struct with offsets per VM. + // Because the access to KVM_REG_ARM_PTIMER_CNT is only present starting 6.4 kernel, + // we only do the reset if KVM_CAP_COUNTER_OFFSET is present as it was added + // in the same patch series as the ability to set the KVM_REG_ARM_PTIMER_CNT register. + // Path series which introduced the needed changes: + // https://lore.kernel.org/all/20230330174800.2677007-1-maz@kernel.org/ + // Note: the value observed by the guest will still be above 0, because there is a delta + // time between this resetting and first call to KVM_RUN. 
+ if optional_capabilities.counter_offset { + vcpufd + .set_one_reg(KVM_REG_ARM_PTIMER_CNT, &[0; 8]) + .map_err(|err| VcpuError::SetOneReg(id, err))?; + } } Ok(()) } @@ -226,8 +245,9 @@ mod tests { let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let optional_capabilities = kvm.optional_capabilities(); - let res = setup_boot_regs(&vcpu, 0, 0x0, &mem); + let res = setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities); assert!(matches!( res.unwrap_err(), VcpuError::SetOneReg(0x6030000000100042, _) @@ -237,7 +257,25 @@ mod tests { vm.get_preferred_target(&mut kvi).unwrap(); vcpu.vcpu_init(&kvi).unwrap(); - setup_boot_regs(&vcpu, 0, 0x0, &mem).unwrap(); + setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities).unwrap(); + + // Check that the register is reset on compatible kernels. + // Because there is a delta in time between we reset the register and time we + // read it, we cannot compare with 0. Instead we compare it with meaningfully + // small value. + if optional_capabilities.counter_offset { + let mut reg_bytes = [0_u8; 8]; + vcpu.get_one_reg(SYS_CNTPCT_EL0, &mut reg_bytes).unwrap(); + let counter_value = u64::from_le_bytes(reg_bytes); + + // We are reading the SYS_CNTPCT_EL0 right after resetting it. + // If reset did happen successfully, the value should be quite small when we read it. + // If the reset did not happen, the value will be same as on the host and it surely + // will be more that MAX_VALUE. + let max_value = 1000; + + assert!(counter_value < max_value); + } } #[test] diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 6a59fe35a83..4b43e67541f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -814,16 +814,16 @@ pub fn configure_system_for_boot( cpu_config, }; - // Configure vCPUs with normalizing and setting the generated CPU configuration. - for vcpu in vcpus.iter_mut() { - vcpu.kvm_vcpu - .configure(vmm.guest_memory(), entry_addr, &vcpu_config) - .map_err(VmmError::VcpuConfigure) - .map_err(Internal)?; - } - #[cfg(target_arch = "x86_64")] { + // Configure vCPUs with normalizing and setting the generated CPU configuration. + for vcpu in vcpus.iter_mut() { + vcpu.kvm_vcpu + .configure(vmm.guest_memory(), entry_addr, &vcpu_config) + .map_err(VmmError::VcpuConfigure) + .map_err(Internal)?; + } + // Write the kernel command line to guest memory. This is x86_64 specific, since on // aarch64 the command line will be specified through the FDT. let cmdline_size = boot_cmdline @@ -858,6 +858,19 @@ pub fn configure_system_for_boot( } #[cfg(target_arch = "aarch64")] { + let optional_capabilities = vmm.kvm.optional_capabilities(); + // Configure vCPUs with normalizing and setting the generated CPU configuration. + for vcpu in vcpus.iter_mut() { + vcpu.kvm_vcpu + .configure( + vmm.guest_memory(), + entry_addr, + &vcpu_config, + &optional_capabilities, + ) + .map_err(VmmError::VcpuConfigure) + .map_err(Internal)?; + } let vcpu_mpidr = vcpus .iter_mut() .map(|cpu| cpu.kvm_vcpu.get_mpidr()) diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs index 985a9fae1b3..59b192dbe09 100644 --- a/src/vmm/src/vstate/kvm.rs +++ b/src/vmm/src/vstate/kvm.rs @@ -140,7 +140,13 @@ impl Kvm { } } } - +#[cfg(target_arch = "aarch64")] +/// Optional capabilities. 
+#[derive(Debug, Default)] +pub struct OptionalCapabilities { + /// KVM_CAP_COUNTER_OFFSET + pub counter_offset: bool, +} #[cfg(target_arch = "aarch64")] impl Kvm { const DEFAULT_CAPABILITIES: [u32; 7] = [ @@ -152,6 +158,16 @@ impl Kvm { kvm_bindings::KVM_CAP_MP_STATE, kvm_bindings::KVM_CAP_ONE_REG, ]; + + /// Returns struct with optional capabilities statuses. + pub fn optional_capabilities(&self) -> OptionalCapabilities { + OptionalCapabilities { + counter_offset: self + .fd + .check_extension_raw(kvm_bindings::KVM_CAP_COUNTER_OFFSET.into()) + != 0, + } + } } #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/vstate/vcpu/aarch64.rs b/src/vmm/src/vstate/vcpu/aarch64.rs index 4097ef59044..9db1173eeb8 100644 --- a/src/vmm/src/vstate/vcpu/aarch64.rs +++ b/src/vmm/src/vstate/vcpu/aarch64.rs @@ -22,7 +22,7 @@ use crate::cpu_config::aarch64::custom_cpu_template::VcpuFeatures; use crate::cpu_config::templates::CpuConfiguration; use crate::logger::{error, IncMetric, METRICS}; use crate::vcpu::{VcpuConfig, VcpuError}; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, OptionalCapabilities}; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::VcpuEmulation; use crate::vstate::vm::Vm; @@ -116,6 +116,7 @@ impl KvmVcpu { guest_mem: &GuestMemoryMmap, kernel_load_addr: GuestAddress, vcpu_config: &VcpuConfig, + optional_capabilities: &OptionalCapabilities, ) -> Result<(), KvmVcpuError> { for reg in vcpu_config.cpu_config.regs.iter() { self.fd @@ -128,6 +129,7 @@ impl KvmVcpu { self.index, kernel_load_addr.raw_value(), guest_mem, + optional_capabilities, ) .map_err(KvmVcpuError::ConfigureRegisters)?; @@ -338,7 +340,8 @@ mod tests { #[test] fn test_configure_vcpu() { - let (_, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let optional_capabilities = kvm.optional_capabilities(); let vcpu_config = VcpuConfig { vcpu_count: 1, @@ -349,6 +352,7 @@ mod tests { &vm_mem, GuestAddress(crate::arch::get_kernel_start()), &vcpu_config, + &optional_capabilities, ) .unwrap(); @@ -358,6 +362,7 @@ mod tests { &vm_mem, GuestAddress(crate::arch::get_kernel_start()), &vcpu_config, + &optional_capabilities, ); assert_eq!( err.unwrap_err(), diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index 3d8877285a4..6a6471193dc 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -1008,6 +1008,7 @@ pub(crate) mod tests { smt: false, cpu_config: crate::cpu_config::aarch64::CpuConfiguration::default(), }, + &kvm.optional_capabilities(), ) .expect("failed to configure vcpu"); From b29ab6702bc1c3d4352324e27d4e18c8254d3563 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Fri, 10 Jan 2025 17:19:39 +0000 Subject: [PATCH 3/5] chore: update prod-host-setup.md with arm physical counter info Update a note about physical counter on ARM being reset instead of directly passed through on kernels with `KVM_CAP_COUNTER_OFFSET` capability. 
Signed-off-by: Egor Lazarchuk
---
 docs/prod-host-setup.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/prod-host-setup.md b/docs/prod-host-setup.md
index f046e31c735..cf0ac86cf1f 100644
--- a/docs/prod-host-setup.md
+++ b/docs/prod-host-setup.md
@@ -328,13 +328,16 @@ For vendor-specific recommendations, please consult the resources below:
 - ARM:
   [Speculative Processor Vulnerability](https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability)
 
-##### [ARM only] Physical counter directly passed through to the guest
+##### [ARM only] VM physical counter behaviour
 
-On ARM, the physical counter (i.e `CNTPCT`) it is returning the
-[actual EL1 physical counter value of the host][1]. From the discussions before
-merging this change [upstream][2], this seems like a conscious design decision
-of the ARM code contributors, giving precedence to performance over the ability
-to trap and control this in the hypervisor.
+On ARM, Firecracker tries to reset the `CNTPCT` physical counter on VM boot.
+This is done to prevent the VM from reading the host's physical counter value.
+Firecracker will only try to reset the counter if the host KVM exposes the
+`KVM_CAP_COUNTER_OFFSET` capability. This capability is only present in kernels
+containing
+[this](https://lore.kernel.org/all/20230330174800.2677007-1-maz@kernel.org/)
+patch series (kernel 6.4 and newer). On older kernels, the counter value
+will be passed through from the host.
 
 ##### Verification
 
@@ -428,6 +431,3 @@ To validate that the change took effect, the file
 [^1]: Look for `GRUB_CMDLINE_LINUX` in file `/etc/default/grub` in RPM-based
     systems, and
     [this doc for Ubuntu](https://wiki.ubuntu.com/Kernel/KernelBootParameters).
-
-[1]: https://elixir.free-electrons.com/linux/v4.14.203/source/virt/kvm/arm/hyp/timer-sr.c#L63
-[2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023323.html

From a9a7e68d535934d29b01088e025741e40f8f2925 Mon Sep 17 00:00:00 2001
From: Egor Lazarchuk
Date: Fri, 10 Jan 2025 11:37:21 +0000
Subject: [PATCH 4/5] chore: add an entry to the CHANGELOG

Add an entry about the physical counter reset to the CHANGELOG.

Signed-off-by: Egor Lazarchuk
---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 138ccd20079..00e82ee4b19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,13 @@ and this project adheres to
 
 ### Added
 
+- [#4987](https://github.com/firecracker-microvm/firecracker/pull/4987): Reset
+  the physical counter register (`CNTPCT_EL0`) on VM startup. This prevents the
+  VM from reading the host's physical counter value. This is only possible on
+  6.4 and newer kernels. On older kernels, the physical counter will still be
+  passed to the guest unmodified. See more info
+  [here](https://github.com/firecracker-microvm/firecracker/blob/main/docs/prod-host-setup.md#arm-only-vm-physical-counter-behaviour).
+
 ### Changed
 
 - [#4913](https://github.com/firecracker-microvm/firecracker/pull/4913): Removed

From d59728f37c196031f40de4d5930dc79c62d82fb2 Mon Sep 17 00:00:00 2001
From: Egor Lazarchuk
Date: Mon, 13 Jan 2025 17:37:52 +0000
Subject: [PATCH 5/5] test: add integration test for physical counter reset

Add a test to verify the reset of the physical counter on aarch64 VMs.
To do this, we check the registers saved in the snapshot and verify that
the counter value is less than a reasonably small number we choose. The
value is based on observing how many cycles it takes for a VM to boot
and be snapshotted.
The idea is that this value will always be smaller than the actual
physical counter on the host.

Signed-off-by: Egor Lazarchuk
---
 .../functional/test_snapshot_basic.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py
index 4030bb4e981..875ef77dbaf 100644
--- a/tests/integration_tests/functional/test_snapshot_basic.py
+++ b/tests/integration_tests/functional/test_snapshot_basic.py
@@ -5,6 +5,7 @@
 import filecmp
 import logging
 import os
+import platform
 import re
 import shutil
 import time
@@ -12,8 +13,11 @@
 
 import pytest
 
+import host_tools.cargo_build as host
 import host_tools.drive as drive_tools
+from framework import utils
 from framework.microvm import SnapshotType
+from framework.properties import global_props
 from framework.utils import check_filesystem, check_output
 from framework.utils_vsock import (
     ECHO_SERVER_PORT,
@@ -540,3 +544,56 @@ def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type)
 
         # Update the base for next iteration
         base_snapshot = snapshot
+
+
+# TODO: add a `global_props.host_os == "amzn2"` condition
+# once Amazon Linux kernels have the patches.
+@pytest.mark.skipif(
+    platform.machine() != "aarch64" or global_props.host_linux_version_tpl < (6, 4),
+    reason="This is an aarch64-specific test and should only be run on 6.4 and later kernels",
+)
+def test_physical_counter_reset_aarch64(uvm_nano):
+    """
+    Test that the CNTPCT_EL0 register is reset on VM boot.
+    We assume the smallest VM will not consume more than
+    some MAX_VALUE cycles to be created and snapshotted.
+    The MAX_VALUE is selected by doing a manual run of this test and
+    seeing what the actual counter value is. The assumption here is that
+    if the reset does not occur, the guest counter value will be huge, as
+    it will be a copy of the host value. The host value, in turn, will be
+    huge because it includes host OS boot + CI prep + other CI tests ...
+    """
+    vm = uvm_nano
+    vm.add_net_iface()
+    vm.start()
+
+    snapshot = vm.snapshot_full()
+    vm.kill()
+    snap_editor = host.get_binary("snapshot-editor")
+
+    cntpct_el0 = hex(0x603000000013DF01)
+    # If a CPU runs at 3GHz, it will reach a counter value of 1_000_000_000
+    # in 1/3 of a second. The host will surely have run for more than 1/3 of a
+    # second before executing this test.
+    max_value = 800_000_000
+
+    cmd = [
+        str(snap_editor),
+        "info-vmstate",
+        "vcpu-states",
+        "--vmstate-path",
+        str(snapshot.vmstate),
+    ]
+    _, stdout, _ = utils.check_output(cmd)
+
+    # The output will look like this:
+    # kvm_mp_state: 0x0
+    # mpidr: 0x80000000
+    # 0x6030000000100000 0x0000000e0
+    # 0x6030000000100002 0xffff00fe33c0
+    for line in stdout.splitlines():
+        parts = line.split()
+        if len(parts) == 2:
+            reg_id, reg_value = parts
+            if reg_id == cntpct_el0:
+                assert int(reg_value, 16) < max_value
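
A note for readers of the series: the functional core of the change is a single pattern — probe `KVM_CAP_COUNTER_OFFSET` once on the `/dev/kvm` handle, then reset the physical counter per vCPU only when the capability is present. The sketch below condenses that pattern, assuming the `kvm-ioctls`/`kvm-bindings` APIs referenced in the diffs (`check_extension_raw`, `set_one_reg`, `KVM_CAP_COUNTER_OFFSET`); the register id matches the `cntpct_el0` constant in the test above. It is an illustration, not the exact Firecracker code.

```rust
use kvm_ioctls::{Kvm, VcpuFd};

/// CNTPCT_EL0 encoded as a KVM sysreg id (KVM calls it KVM_REG_ARM_PTIMER_CNT).
/// The value matches `cntpct_el0` in the integration test.
const KVM_REG_ARM_PTIMER_CNT: u64 = 0x6030_0000_0013_DF01;

/// Probe once, where `/dev/kvm` is opened (the `Kvm` wrapper from patch 1).
fn counter_offset_supported(kvm: &Kvm) -> bool {
    // check_extension_raw() returns 0 when the capability is absent.
    kvm.check_extension_raw(kvm_bindings::KVM_CAP_COUNTER_OFFSET.into()) != 0
}

/// During vCPU setup, reset the guest's view of the physical counter.
fn maybe_reset_physical_counter(vcpu: &VcpuFd, counter_offset: bool) {
    if counter_offset {
        // With KVM_CAP_COUNTER_OFFSET present, writing 0 here makes KVM
        // install a counter offset, so the guest no longer observes the
        // raw host CNTPCT_EL0 value.
        vcpu.set_one_reg(KVM_REG_ARM_PTIMER_CNT, &[0u8; 8])
            .expect("failed to reset CNTPCT_EL0");
    }
}
```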
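As for the magic number `0x603000000013DF01` that appears in both the reset path and the test: it is the arm64 `ARM64_SYS_REG()` encoding of `CNTPCT_EL0` (op0=3, op1=3, CRn=14, CRm=0, op2=1). A small derivation sketch, with constant values taken from the arm64 KVM uapi headers and shown here for illustration:

```rust
// Field layout of an arm64 KVM system-register id, per the uapi headers.
const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
const KVM_REG_ARM64_SYSREG: u64 = 0x0013 << 16;

/// Rust mirror of the ARM64_SYS_REG() macro.
const fn arm64_sys_reg(op0: u64, op1: u64, crn: u64, crm: u64, op2: u64) -> u64 {
    KVM_REG_ARM64
        | KVM_REG_SIZE_U64
        | KVM_REG_ARM64_SYSREG
        | (op0 << 14)
        | (op1 << 11)
        | (crn << 7)
        | (crm << 3)
        | op2
}

// CNTPCT_EL0 is (op0=3, op1=3, CRn=14, CRm=0, op2=1) in the Arm ARM numbering.
const SYS_CNTPCT_EL0: u64 = arm64_sys_reg(3, 3, 14, 0, 1);

fn main() {
    // Prints 0x603000000013df01 — the id used by the reset and the test alike.
    println!("{:#x}", SYS_CNTPCT_EL0);
}
```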