diff --git a/src/vmm/benches/memory_access.rs b/src/vmm/benches/memory_access.rs index 4dfb9dc9213..fe4f138db2d 100644 --- a/src/vmm/benches/memory_access.rs +++ b/src/vmm/benches/memory_access.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; -use vm_memory::GuestMemory; use vmm::resources::VmResources; use vmm::vmm_config::machine_config::{HugePageConfig, MachineConfig}; @@ -14,7 +13,7 @@ fn bench_single_page_fault(c: &mut Criterion, configuration: VmResources) { // Get a pointer to the first memory region (cannot do `.get_slice(GuestAddress(0), // 1)`, because on ARM64 guest memory does not start at physical // address 0). - let ptr = memory.iter().next().unwrap().as_ptr(); + let ptr = memory.first().unwrap().as_ptr(); // fine to return both here, because ptr is not a reference into `memory` (e.g. no // self-referential structs are happening here) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 59bca9eadfd..0b5c5edcbde 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -188,7 +188,9 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::test_utils::arch_mem; + use crate::device_manager::resources::ResourceAllocator; + use crate::utils::u64_to_usize; + use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec<u8>); @@ -215,7 +217,7 @@ mod tests { // A mock Vmm object with 128MB of memory let mut vmm = default_vmm(); let mut writer = AcpiTableWriter { - mem: &vmm.guest_memory, + mem: vmm.vm.guest_memory(), resource_allocator: &mut vmm.resource_allocator, }; @@ -263,15 +265,10 @@ mod tests { // change in the future. #[test] fn test_write_acpi_table_small_memory() { - let mut vmm = default_vmm(); - vmm.guest_memory = arch_mem( - (SYSTEM_MEM_START + SYSTEM_MEM_SIZE - 4096) - .try_into() - .unwrap(), - ); + let (_, vm) = setup_vm_with_memory(u64_to_usize(SYSTEM_MEM_START + SYSTEM_MEM_SIZE - 4096)); let mut writer = AcpiTableWriter { - mem: &vmm.guest_memory, - resource_allocator: &mut vmm.resource_allocator, + mem: vm.guest_memory(), + resource_allocator: &mut ResourceAllocator::new().unwrap(), }; let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); diff --git a/src/vmm/src/arch/aarch64/kvm.rs b/src/vmm/src/arch/aarch64/kvm.rs index 12720f5066f..ed66d722970 100644 --- a/src/vmm/src/arch/aarch64/kvm.rs +++ b/src/vmm/src/arch/aarch64/kvm.rs @@ -22,8 +22,6 @@ pub struct OptionalCapabilities { pub struct Kvm { /// KVM fd. pub fd: KvmFd, - /// Maximum number of memory slots allowed by KVM. - pub max_memslots: usize, /// Additional capabilities that were specified in cpu template.
pub kvm_cap_modifiers: Vec<KvmCapability>, } @@ -42,12 +40,10 @@ impl Kvm { /// Initialize [`Kvm`] type for Aarch64 architecture pub fn init_arch( fd: KvmFd, - max_memslots: usize, kvm_cap_modifiers: Vec<KvmCapability>, ) -> Result { Ok(Self { fd, - max_memslots, kvm_cap_modifiers, }) } } diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index e19f496f019..ead827c08c4 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -32,7 +32,7 @@ use crate::utils::{align_up, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm}; +use crate::{Vcpu, VcpuConfig, Vmm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -58,9 +58,35 @@ pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; /// Returns a Vec of the valid memory addresses for aarch64. /// See [`layout`](layout) module for a drawing of the specific memory model for this platform. -pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { - let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE); - vec![(GuestAddress(layout::DRAM_MEM_START), dram_size)] +/// +/// The `offset` parameter specifies the offset from [`layout::DRAM_MEM_START`]. +pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { + assert!(size > 0, "Attempt to allocate guest memory of length 0"); + assert!( + offset.checked_add(size).is_some(), + "Attempt to allocate guest memory such that the address space would wrap around" + ); + assert!( + offset < layout::DRAM_MEM_MAX_SIZE, + "offset outside allowed DRAM range" + ); + + let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE - offset); + + if dram_size != size { + logger::warn!( + "Requested offset/memory size {}/{} exceeds architectural maximum (1022GiB). Size has \ been truncated to {}", + offset, + size, + dram_size + ); + } + + vec![( + GuestAddress(layout::DRAM_MEM_START + offset as u64), + dram_size, + )] } /// Configures the system for booting Linux. @@ -89,7 +115,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration.
for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - &vmm.guest_memory, + vmm.vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -105,7 +131,7 @@ .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - &vmm.guest_memory, + vmm.vm.guest_memory(), vcpu_mpidr, cmdline, vmm.mmio_device_manager.get_device_info(), @@ -114,8 +140,10 @@ initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(&vmm.guest_memory)); - vmm.guest_memory.write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); + vmm.vm + .guest_memory() + .write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } @@ -182,6 +210,45 @@ pub fn load_kernel( }) } +#[cfg(kani)] +mod verification { + use vm_memory::GuestAddress; + + use crate::arch::aarch64::layout; + use crate::arch::arch_memory_regions; + + #[kani::proof] + #[kani::unwind(3)] + fn verify_arch_memory_regions() { + let offset: u64 = kani::any::<u64>(); + let len: u64 = kani::any::<u64>(); + + kani::assume(len > 0); + kani::assume(offset.checked_add(len).is_some()); + kani::assume(offset < layout::DRAM_MEM_MAX_SIZE as u64); + + let regions = arch_memory_regions(offset as usize, len as usize); + + // No MMIO gap on ARM + assert_eq!(regions.len(), 1); + + let (GuestAddress(start), actual_len) = regions[0]; + let actual_len = actual_len as u64; + + assert_eq!(start, layout::DRAM_MEM_START + offset); + assert!(actual_len <= layout::DRAM_MEM_MAX_SIZE as u64); + assert!(actual_len <= len); + + if actual_len < len { + assert_eq!( + start + actual_len, + layout::DRAM_MEM_START + layout::DRAM_MEM_MAX_SIZE as u64 + ); + assert!(offset + len >= layout::DRAM_MEM_MAX_SIZE as u64); + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -189,7 +256,7 @@ #[test] fn test_regions_lt_1024gb() { - let regions = arch_memory_regions(1usize << 29); + let regions = arch_memory_regions(0, 1usize << 29); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); assert_eq!(1usize << 29, regions[0].1); } #[test] fn test_regions_gt_1024gb() { - let regions = arch_memory_regions(1usize << 41); + let regions = arch_memory_regions(0, 1usize << 41); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); assert_eq!(super::layout::DRAM_MEM_MAX_SIZE, regions[0].1); diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 82bbd499deb..6a418f59b5e 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -502,27 +502,26 @@ mod tests { use crate::test_utils::arch_mem; use crate::vcpu::VcpuConfig; use crate::vstate::kvm::Kvm; - use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::Vm; use crate::vstate::vm::tests::setup_vm_with_memory; - fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { - let (kvm, mut vm, mut vcpu, vm_mem) = setup_vcpu_no_init(mem_size); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu) { + let (kvm, mut vm, mut vcpu) = setup_vcpu_no_init(mem_size); vcpu.init(&[]).unwrap(); vm.setup_irqchip(1).unwrap(); - (kvm, vm, vcpu, vm_mem) + (kvm, vm, vcpu) } - fn setup_vcpu_no_init(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { - let (kvm, vm, vm_mem) = setup_vm_with_memory(mem_size); + fn setup_vcpu_no_init(mem_size: usize) -> (Kvm, Vm, KvmVcpu) { + let (kvm, vm) = setup_vm_with_memory(mem_size); let vcpu =
KvmVcpu::new(0, &vm).unwrap(); - (kvm, vm, vcpu, vm_mem) + (kvm, vm, vcpu) } #[test] fn test_create_vcpu() { - let (_, vm, _) = setup_vm_with_memory(0x1000); + let (_, vm) = setup_vm_with_memory(0x1000); unsafe { libc::close(vm.fd().as_raw_fd()) }; @@ -538,7 +537,7 @@ #[test] fn test_configure_vcpu() { - let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, vm, mut vcpu) = setup_vcpu(0x10000); let optional_capabilities = kvm.optional_capabilities(); let vcpu_config = VcpuConfig { @@ -548,7 +547,7 @@ }; vcpu.configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(crate::arch::get_kernel_start()), protocol: BootProtocol::LinuxBoot, @@ -561,7 +560,7 @@ unsafe { libc::close(vcpu.fd.as_raw_fd()) }; let err = vcpu.configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(crate::arch::get_kernel_start()), protocol: BootProtocol::LinuxBoot, @@ -583,7 +582,7 @@ #[test] fn test_init_vcpu() { - let (_, mut vm, _) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); vm.setup_irqchip(1).unwrap(); @@ -602,7 +601,7 @@ #[test] fn test_vcpu_save_restore_state() { - let (_, mut vm, _) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); vm.setup_irqchip(1).unwrap(); @@ -646,7 +645,7 @@ // // This should fail with ENOEXEC. // https://elixir.bootlin.com/linux/v5.10.176/source/arch/arm64/kvm/arm.c#L1165 - let (_, mut vm, _) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); let vcpu = KvmVcpu::new(0, &vm).unwrap(); vm.setup_irqchip(1).unwrap(); @@ -656,7 +655,7 @@ #[test] fn test_dump_cpu_config_after_init() { // Test `dump_cpu_config()` after `KVM_VCPU_INIT`. - let (_, mut vm, _) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu.init(&[]).unwrap(); @@ -666,7 +665,7 @@ #[test] fn test_setup_non_boot_vcpu() { - let (_, vm, _) = setup_vm_with_memory(0x1000); + let (_, vm) = setup_vm_with_memory(0x1000); let mut vcpu1 = KvmVcpu::new(0, &vm).unwrap(); vcpu1.init(&[]).unwrap(); let mut vcpu2 = KvmVcpu::new(1, &vm).unwrap(); @@ -678,7 +677,7 @@ // Test `get_regs()` with valid register IDs. // - X0: 0x6030 0000 0010 0000 // - X1: 0x6030 0000 0010 0002 - let (_, _, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu) = setup_vcpu(0x10000); let reg_list = Vec::<u64>::from([0x6030000000100000, 0x6030000000100002]); get_registers(&vcpu.fd, &reg_list, &mut Aarch64RegisterVec::default()).unwrap(); } @@ -686,14 +685,14 @@ #[test] fn test_get_invalid_regs() { // Test `get_regs()` with invalid register IDs.
- let (_, _, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu) = setup_vcpu(0x10000); let reg_list = Vec::<u64>::from([0x6030000000100001, 0x6030000000100003]); get_registers(&vcpu.fd, &reg_list, &mut Aarch64RegisterVec::default()).unwrap_err(); } #[test] fn test_setup_regs() { - let (kvm, _, vcpu, _) = setup_vcpu_no_init(0x10000); + let (kvm, _, vcpu) = setup_vcpu_no_init(0x10000); let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let optional_capabilities = kvm.optional_capabilities(); @@ -730,7 +729,7 @@ #[test] fn test_read_mpidr() { - let (_, _, vcpu, _) = setup_vcpu_no_init(0x10000); + let (_, _, vcpu) = setup_vcpu_no_init(0x10000); // Must fail when vcpu is not initialized yet. let res = vcpu.get_mpidr(); @@ -745,7 +744,7 @@ #[test] fn test_get_set_regs() { - let (_, _, vcpu, _) = setup_vcpu_no_init(0x10000); + let (_, _, vcpu) = setup_vcpu_no_init(0x10000); // Must fail when vcpu is not initialized yet. let mut regs = Aarch64RegisterVec::default(); @@ -763,7 +762,7 @@ fn test_mpstate() { use std::os::unix::io::AsRawFd; - let (_, _, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu) = setup_vcpu(0x10000); let res = vcpu.get_mpstate(); vcpu.set_mpstate(res.unwrap()).unwrap(); diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index d1b6f07663f..e54723f5b6d 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -1,18 +1,18 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use kvm_ioctls::VmFd; use serde::{Deserialize, Serialize}; use crate::Kvm; use crate::arch::aarch64::gic::GicState; -use crate::vstate::vm::VmError; +use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::vm::{VmCommon, VmError}; /// Structure representing the current architecture's understanding of what a "virtual machine" is. #[derive(Debug)] pub struct ArchVm { - /// KVM file descriptor of microVM - pub fd: VmFd, + /// Architecture-independent parts of a VM. + pub common: VmCommon, // On aarch64 we need to keep around the fd obtained by creating the VGIC device. irqchip_handle: Option, } @@ -31,9 +31,9 @@ pub enum ArchVmError { impl ArchVm { /// Create a new `Vm` struct. pub fn new(kvm: &Kvm) -> Result { - let fd = Self::create_vm(kvm)?; + let common = Self::create_common(kvm)?; Ok(ArchVm { - fd, + common, irqchip_handle: None, }) } @@ -55,7 +55,7 @@ /// Creates the GIC (Global Interrupt Controller). pub fn setup_irqchip(&mut self, vcpu_count: u8) -> Result<(), ArchVmError> { self.irqchip_handle = Some( - crate::arch::aarch64::gic::create_gic(&self.fd, vcpu_count.into(), None) + crate::arch::aarch64::gic::create_gic(self.fd(), vcpu_count.into(), None) .map_err(ArchVmError::VmCreateGIC)?, ); Ok(()) } @@ -69,6 +69,7 @@ /// Saves and returns the Kvm Vm state. pub fn save_state(&self, mpidrs: &[u64]) -> Result { Ok(VmState { + memory: self.common.guest_memory.describe(), gic: self .get_irqchip() .save_device(mpidrs) @@ -85,6 +86,7 @@ self.get_irqchip() .restore_device(mpidrs, &state.gic) .map_err(ArchVmError::RestoreGic)?; + Ok(()) } } @@ -92,6 +94,8 @@ /// Structure holding aarch64-specific VM state. #[derive(Debug, Default, Serialize, Deserialize)] pub struct VmState { + /// Guest memory state + pub memory: GuestMemoryState, /// GIC state.
pub gic: GicState, } diff --git a/src/vmm/src/arch/x86_64/kvm.rs b/src/vmm/src/arch/x86_64/kvm.rs index 4d77b0fdbb6..bad7f134d58 100644 --- a/src/vmm/src/arch/x86_64/kvm.rs +++ b/src/vmm/src/arch/x86_64/kvm.rs @@ -21,8 +21,6 @@ pub enum KvmArchError { pub struct Kvm { /// KVM fd. pub fd: KvmFd, - /// Maximum number of memory slots allowed by KVM. - pub max_memslots: usize, /// Additional capabilities that were specified in cpu template. pub kvm_cap_modifiers: Vec<KvmCapability>, /// Supported CpuIds. @@ -50,7 +48,6 @@ impl Kvm { /// Initialize [`Kvm`] type for x86_64 architecture pub fn init_arch( fd: KvmFd, - max_memslots: usize, kvm_cap_modifiers: Vec<KvmCapability>, ) -> Result<Self, KvmArchError> { request_dynamic_xstate_features().map_err(KvmArchError::XstateFeatures)?; @@ -61,7 +58,6 @@ Ok(Kvm { fd, - max_memslots, kvm_cap_modifiers, supported_cpuid, }) } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index fdfce03b069..ca350cbf9af 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -110,17 +110,33 @@ pub const MMIO_MEM_SIZE: u64 = MEM_32BIT_GAP_SIZE; /// These should be used to configure the GuestMemoryMmap structure for the platform. /// For x86_64 all addresses are valid from the start of the kernel except a /// carve out at the end of 32bit address space. -pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { +pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { + // If we get here with size == 0 something has seriously gone wrong. Firecracker should never + // try to allocate guest memory of size 0 + assert!(size > 0, "Attempt to allocate guest memory of length 0"); + assert!( + offset.checked_add(size).is_some(), + "Attempt to allocate guest memory such that the address space would wrap around" + ); + // It's safe to cast MMIO_MEM_START to usize because it fits in a u32 variable // (It points to an address in the 32 bit space). - match size.checked_sub(usize::try_from(MMIO_MEM_START).unwrap()) { + match (size + offset).checked_sub(u64_to_usize(MMIO_MEM_START)) { // case1: guest memory fits before the gap - None | Some(0) => vec![(GuestAddress(0), size)], - // case2: guest memory extends beyond the gap - Some(remaining) => vec![ - (GuestAddress(0), usize::try_from(MMIO_MEM_START).unwrap()), + None | Some(0) => vec![(GuestAddress(offset as u64), size)], + // case2: starts before the gap, but doesn't completely fit + Some(remaining) if (offset as u64) < MMIO_MEM_START => vec![ + ( + GuestAddress(offset as u64), + u64_to_usize(MMIO_MEM_START) - offset, + ), (GuestAddress(FIRST_ADDR_PAST_32BITS), remaining), ], + // case3: guest memory starts after the gap + Some(_) => vec![( + GuestAddress(FIRST_ADDR_PAST_32BITS.max(offset as u64)), + size, + )], } } @@ -169,7 +185,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(&vmm.guest_memory, entry_point, &vcpu_config)?; + .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory.
This is x86_64 specific, since on @@ -180,7 +196,7 @@ .expect("Cannot create cstring from cmdline string"); load_cmdline( - &vmm.guest_memory, + vmm.vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -188,7 +204,7 @@ // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - &vmm.guest_memory, + vmm.vm.guest_memory(), &mut vmm.resource_allocator, vcpu_config.vcpu_count, ) @@ -196,11 +212,11 @@ match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(&vmm.guest_memory, GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - &vmm.guest_memory, + vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -211,7 +227,7 @@ // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 create_acpi_tables( - &vmm.guest_memory, + vmm.vm.guest_memory(), &mut vmm.resource_allocator, &vmm.mmio_device_manager, &vmm.acpi_device_manager, @@ -456,6 +472,56 @@ pub fn load_kernel( }) } +#[cfg(kani)] +mod verification { + use crate::arch::x86_64::FIRST_ADDR_PAST_32BITS; + use crate::arch::{MMIO_MEM_START, arch_memory_regions}; + + #[kani::proof] + #[kani::unwind(3)] + fn verify_arch_memory_regions() { + let offset: u64 = kani::any::<u64>(); + let len: u64 = kani::any::<u64>(); + + kani::assume(len > 0); + kani::assume(offset.checked_add(len).is_some()); + + let regions = arch_memory_regions(offset as usize, len as usize); + + // There's only one MMIO gap, so we can get either 1 or 2 regions + assert!(regions.len() <= 2); + assert!(regions.len() >= 1); + + // The total length of all regions is what we requested + assert_eq!( + regions.iter().map(|&(_, len)| len).sum::<usize>(), + len as usize + ); + + // No region overlaps the MMIO gap + assert!( + regions + .iter() + .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_32BITS + || start.0 + len as u64 <= MMIO_MEM_START) + ); + + // All regions start at or after our specified offset + assert!(regions.iter().all(|&(start, _)| start.0 >= offset as u64)); + + // All regions have non-zero length + assert!(regions.iter().all(|&(_, len)| len > 0)); + + // If there are two regions, they perfectly snuggle up to the MMIO gap + if regions.len() == 2 { + kani::cover!(); + + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO_MEM_START); + assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_32BITS); + } + } +} + #[cfg(test)] mod tests { use linux_loader::loader::bootparam::boot_e820_entry; @@ -466,18 +532,41 @@ #[test] fn regions_lt_4gb() { - let regions = arch_memory_regions(1usize << 29); + let regions = arch_memory_regions(0, 1usize << 29); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(1usize << 29, regions[0].1); + + let regions = arch_memory_regions(1 << 28, 1 << 29); + assert_eq!(1, regions.len()); + assert_eq!(regions[0], (GuestAddress(1 << 28), 1 << 29)); } #[test] fn regions_gt_4gb() { - let regions = arch_memory_regions((1usize << 32) + 0x8000); + const MEMORY_SIZE: usize = (1 << 32) + 0x8000; + + let regions = arch_memory_regions(0, MEMORY_SIZE); assert_eq!(2, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(GuestAddress(1u64 << 32), regions[1].0); + + let regions = arch_memory_regions(1 << 31, MEMORY_SIZE); + 
assert_eq!(2, regions.len()); + assert_eq!( + regions[0], + ( + GuestAddress(1 << 31), + u64_to_usize(MMIO_MEM_START) - (1 << 31) + ) + ); + assert_eq!( + regions[1], + ( + GuestAddress(FIRST_ADDR_PAST_32BITS), + MEMORY_SIZE - regions[0].1 + ) + ) } #[test] diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index 3cf53cdefc1..6c07720b573 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -825,11 +825,11 @@ mod tests { } } - fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { - let (kvm, vm, vm_mem) = setup_vm_with_memory(mem_size); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu) { + let (kvm, vm) = setup_vm_with_memory(mem_size); vm.setup_irqchip().unwrap(); let vcpu = KvmVcpu::new(0, &vm).unwrap(); - (kvm, vm, vcpu, vm_mem) + (kvm, vm, vcpu) } fn is_at_least_cascade_lake() -> bool { @@ -864,12 +864,12 @@ mod tests { #[test] fn test_configure_vcpu() { - let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, vm, mut vcpu) = setup_vcpu(0x10000); let vcpu_config = create_vcpu_config(&kvm, &vcpu, &CustomCpuTemplate::default()).unwrap(); assert_eq!( vcpu.configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(0), protocol: BootProtocol::LinuxBoot, @@ -886,7 +886,7 @@ mod tests { Ok(template) => match create_vcpu_config(kvm, vcpu, &template) { Ok(config) => vcpu .configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(crate::arch::get_kernel_start()), protocol: BootProtocol::LinuxBoot, @@ -946,7 +946,7 @@ mod tests { #[test] fn test_vcpu_cpuid_restore() { - let (kvm, _, vcpu, _mem) = setup_vcpu(0x10000); + let (kvm, _, vcpu) = setup_vcpu(0x10000); vcpu.fd.set_cpuid2(&kvm.supported_cpuid).unwrap(); // Mutate the CPUID. @@ -964,7 +964,7 @@ mod tests { drop(vcpu); // Restore the state into a new vcpu. - let (_, _vm, vcpu, _mem) = setup_vcpu(0x10000); + let (_, _vm, vcpu) = setup_vcpu(0x10000); let result2 = vcpu.restore_state(&state); assert!(result2.is_ok(), "{}", result2.unwrap_err()); @@ -984,7 +984,7 @@ mod tests { #[test] fn test_empty_cpuid_entries_removed() { // Test that `get_cpuid()` removes zeroed empty entries from the `KVM_GET_CPUID2` result. - let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, vm, mut vcpu) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, @@ -994,7 +994,7 @@ mod tests { }, }; vcpu.configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(0), protocol: BootProtocol::LinuxBoot, @@ -1042,7 +1042,7 @@ mod tests { // Since `KVM_SET_CPUID2` has not been called before vcpu configuration, all leaves should // be filled with zero. Therefore, `KvmVcpu::dump_cpu_config()` should fail with CPUID type // conversion error due to the lack of brand string info in leaf 0x0. - let (_, _, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu) = setup_vcpu(0x10000); match vcpu.dump_cpu_config() { Err(KvmVcpuError::ConvertCpuidType(_)) => (), Err(err) => panic!("Unexpected error: {err}"), @@ -1053,7 +1053,7 @@ mod tests { #[test] fn test_dump_cpu_config_with_configured_vcpu() { // Test `dump_cpu_config()` after vcpu configuration. 
- let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, vm, mut vcpu) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, @@ -1064,7 +1064,7 @@ }; vcpu.configure( - &vm_mem, + vm.guest_memory(), EntryPoint { entry_addr: GuestAddress(0), protocol: BootProtocol::LinuxBoot, @@ -1080,7 +1080,7 @@ fn test_is_tsc_scaling_required() { // Test `is_tsc_scaling_required` as if it were on the same // CPU model as the one in the snapshot state. - let (_, _, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu) = setup_vcpu(0x1000); { // The frequency difference is within tolerance. @@ -1122,7 +1122,7 @@ #[test] fn test_set_tsc() { - let (kvm, _, vcpu, _) = setup_vcpu(0x1000); + let (kvm, _, vcpu) = setup_vcpu(0x1000); let mut state = vcpu.save_state().unwrap(); state.tsc_khz = Some( state.tsc_khz.unwrap() @@ -1147,7 +1147,7 @@ fn test_get_msrs_with_msrs_to_save() { // Test `get_msrs()` with the MSR indices that should be serialized into snapshots. // The MSR indices should be valid and this test should succeed. - let (_, _, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu) = setup_vcpu(0x1000); vcpu.get_msrs(vcpu.msrs_to_save.iter().copied()).unwrap(); } @@ -1155,7 +1155,7 @@ fn test_get_msrs_with_msrs_to_dump() { // Test `get_msrs()` with the MSR indices that should be dumped. // All the MSR indices should be valid and the call should succeed. - let (_, _, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu) = setup_vcpu(0x1000); let kvm = kvm_ioctls::Kvm::new().unwrap(); let msrs_to_dump = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm).unwrap(); @@ -1168,7 +1168,7 @@ // Test `get_msrs()` with unsupported MSR indices. This should return `VcpuGetMsr` error // that happens when `KVM_GET_MSRS` fails to populate MSR values in the middle and exits. // Currently, MSR indices 2..=4 are not listed as supported MSRs. - let (_, _, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu) = setup_vcpu(0x1000); let msr_index_list: Vec<u32> = vec![2, 3, 4]; match vcpu.get_msrs(msr_index_list.iter().copied()) { Err(KvmVcpuError::VcpuGetMsr(_)) => (), diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index ebce8ef02d0..e84b4338e35 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -7,12 +7,13 @@ use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, }; -use kvm_ioctls::{Cap, VmFd}; +use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; use crate::utils::u64_to_usize; -use crate::vstate::vm::VmError; +use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] #[allow(missing_docs)] @@ -48,8 +49,8 @@ pub enum ArchVmError { /// Structure representing the current architecture's understanding of what a "virtual machine" is. #[derive(Debug)] pub struct ArchVm { - /// KVM file descriptor of microVM - pub fd: VmFd, + /// Architecture-independent parts of a VM + pub common: VmCommon, msrs_to_save: MsrList, /// Size in bytes required to hold the dynamically-sized `kvm_xsave` struct. /// @@ -60,7 +61,7 @@ impl ArchVm { /// Create a new `Vm` struct.
pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let fd = Self::create_vm(kvm)?; + let common = Self::create_common(kvm)?; let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; @@ -70,7 +71,7 @@ impl ArchVm { // https://github.com/torvalds/linux/commit/be50b2065dfa3d88428fdfdc340d154d96bf6848 // // Cache the value in order not to call it at each vCPU creation. - let xsave2_size = match fd.check_extension_int(Cap::Xsave2) { + let xsave2_size = match common.fd.check_extension_int(Cap::Xsave2) { // Catch all negative values just in case although the possible negative return value // of ioctl() is only -1. ..=-1 => { @@ -84,11 +85,13 @@ impl ArchVm { ret => Some(usize::try_from(ret).unwrap()), }; - fd.set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) + common + .fd + .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; Ok(ArchVm { - fd, + common, msrs_to_save, xsave2_size, }) @@ -116,19 +119,19 @@ impl ArchVm { /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. pub fn restore_state(&mut self, state: &VmState) -> Result<(), ArchVmError> { - self.fd + self.fd() .set_pit2(&state.pitstate) .map_err(ArchVmError::SetPit2)?; - self.fd + self.fd() .set_clock(&state.clock) .map_err(ArchVmError::SetClock)?; - self.fd + self.fd() .set_irqchip(&state.pic_master) .map_err(ArchVmError::SetIrqChipPicMaster)?; - self.fd + self.fd() .set_irqchip(&state.pic_slave) .map_err(ArchVmError::SetIrqChipPicSlave)?; - self.fd + self.fd() .set_irqchip(&state.ioapic) .map_err(ArchVmError::SetIrqChipIoAPIC)?; Ok(()) @@ -136,7 +139,7 @@ impl ArchVm { /// Creates the irq chip and an in-kernel device model for the PIT. pub fn setup_irqchip(&self) -> Result<(), ArchVmError> { - self.fd + self.fd() .create_irq_chip() .map_err(ArchVmError::VmSetIrqChip)?; // We need to enable the emulation of a dummy speaker port stub so that writing to port 0x61 @@ -145,16 +148,16 @@ impl ArchVm { flags: KVM_PIT_SPEAKER_DUMMY, ..Default::default() }; - self.fd + self.fd() .create_pit2(pit_config) .map_err(ArchVmError::VmSetIrqChip) } /// Saves and returns the Kvm Vm state. pub fn save_state(&self) -> Result { - let pitstate = self.fd.get_pit2().map_err(ArchVmError::VmGetPit2)?; + let pitstate = self.fd().get_pit2().map_err(ArchVmError::VmGetPit2)?; - let mut clock = self.fd.get_clock().map_err(ArchVmError::VmGetClock)?; + let mut clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?; // This bit is not accepted in SET_CLOCK, clear it. clock.flags &= !KVM_CLOCK_TSC_STABLE; @@ -162,7 +165,7 @@ impl ArchVm { chip_id: KVM_IRQCHIP_PIC_MASTER, ..Default::default() }; - self.fd + self.fd() .get_irqchip(&mut pic_master) .map_err(ArchVmError::VmGetIrqChip)?; @@ -170,7 +173,7 @@ impl ArchVm { chip_id: KVM_IRQCHIP_PIC_SLAVE, ..Default::default() }; - self.fd + self.fd() .get_irqchip(&mut pic_slave) .map_err(ArchVmError::VmGetIrqChip)?; @@ -178,11 +181,12 @@ impl ArchVm { chip_id: KVM_IRQCHIP_IOAPIC, ..Default::default() }; - self.fd + self.fd() .get_irqchip(&mut ioapic) .map_err(ArchVmError::VmGetIrqChip)?; Ok(VmState { + memory: self.common.guest_memory.describe(), pitstate, clock, pic_master, @@ -205,6 +209,8 @@ impl ArchVm { #[derive(Default, Deserialize, Serialize)] /// Structure holding VM kvm state. 
pub struct VmState { + /// Guest memory state + pub memory: GuestMemoryState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. @@ -244,7 +250,7 @@ mod tests { // Irqchips, clock and pitstate are not configured so trying to save state should fail. vm.save_state().unwrap_err(); - let (_, vm, _mem) = setup_vm_with_memory(0x1000); + let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let vm_state = vm.save_state().unwrap(); @@ -257,7 +263,7 @@ assert_eq!(vm_state.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE); assert_eq!(vm_state.ioapic.chip_id, KVM_IRQCHIP_IOAPIC); - let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); vm.restore_state(&vm_state).unwrap(); @@ -268,11 +274,11 @@ fn test_vm_save_restore_state_bad_irqchip() { use kvm_bindings::KVM_NR_IRQCHIPS; - let (_, vm, _mem) = setup_vm_with_memory(0x1000); + let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut vm_state = vm.save_state().unwrap(); - let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); // Try to restore an invalid PIC Master chip ID @@ -297,7 +303,7 @@ fn test_vmstate_serde() { let mut snapshot_data = vec![0u8; 10000]; - let (_, mut vm, _) = setup_vm_with_memory(0x1000); + let (_, mut vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let state = vm.save_state().unwrap(); Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 9b5924426a6..27ba3e96fa9 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -57,7 +57,7 @@ use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; -use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::memory::GuestRegionMmap; use crate::vstate::vcpu::{Vcpu, VcpuError}; use crate::vstate::vm::Vm; use crate::{EventManager, Vmm, VmmError, device_manager}; @@ -134,8 +134,6 @@ impl std::convert::From for StartMicrovmError { fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, - guest_memory: GuestMemoryMmap, - uffd: Option<Uffd>, vcpu_count: u8, kvm_capabilities: Vec<KvmCapability>, ) -> Result<(Vmm, Vec<Vcpu>), VmmError> { // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. let mut vm = Vm::new(&kvm)?; - kvm.check_memory(&guest_memory)?; - vm.memory_init(&guest_memory)?; let resource_allocator = ResourceAllocator::new()?; @@ -180,8 +176,7 @@ shutdown_exit_code: None, kvm, vm, - guest_memory, - uffd, + uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, resource_allocator, @@ -220,8 +215,6 @@ pub fn build_microvm_for_boot( .allocate_guest_memory() .map_err(StartMicrovmError::GuestMemory)?; - let entry_point = load_kernel(&boot_config.kernel_file, &guest_memory)?; - let initrd = InitrdConfig::from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original.
#[allow(unused_mut)] let mut boot_cmdline = boot_config.cmdline.clone(); @@ -234,12 +227,17 @@ let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, - guest_memory, - None, vm_resources.machine_config.vcpu_count, cpu_template.kvm_capabilities.clone(), )?; + vmm.vm + .register_memory_regions(guest_memory) + .map_err(VmmError::Vm)?; + + let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); #[cfg(feature = "gdb")] @@ -414,7 +412,7 @@ pub fn build_microvm_from_snapshot( instance_info: &InstanceInfo, event_manager: &mut EventManager, microvm_state: MicrovmState, - guest_memory: GuestMemoryMmap, + guest_memory: Vec<GuestRegionMmap>, uffd: Option<Uffd>, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -424,13 +422,17 @@ let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, - guest_memory, - uffd, vm_resources.machine_config.vcpu_count, microvm_state.kvm_state.kvm_cap_modifiers.clone(), ) .map_err(StartMicrovmError::Internal)?; + vmm.vm + .register_memory_regions(guest_memory) + .map_err(VmmError::Vm) + .map_err(StartMicrovmError::Internal)?; + + vmm.uffd = uffd; + #[cfg(target_arch = "x86_64")] { // Scale TSC to match, extract the TSC freq from the state if specified @@ -469,7 +471,7 @@ // Restore devices states. let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: &vmm.guest_memory, + mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager, resource_allocator: &mut vmm.resource_allocator, @@ -485,7 +487,7 @@ { let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: &vmm.guest_memory, + mem: vmm.vm.guest_memory(), resource_allocator: &mut vmm.resource_allocator, vm: vmm.vm.fd(), }; @@ -594,7 +596,7 @@ fn attach_virtio_device( event_manager.add_subscriber(device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock.
- let device = MmioTransport::new(vmm.guest_memory.clone(), device, is_vhost_user); + let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); vmm.mmio_device_manager .register_mmio_virtio_for_boot( vmm.vm.fd(), @@ -619,7 +621,7 @@ pub(crate) fn attach_boot_timer_device( } fn attach_vmgenid_device(vmm: &mut Vmm) -> Result<(), StartMicrovmError> { - let vmgenid = VmGenId::new(&vmm.guest_memory, &mut vmm.resource_allocator) + let vmgenid = VmGenId::new(vmm.vm.guest_memory(), &mut vmm.resource_allocator) .map_err(StartMicrovmError::CreateVMGenID)?; vmm.acpi_device_manager @@ -814,7 +816,7 @@ pub(crate) mod tests { } pub(crate) fn default_vmm() -> Vmm { - let (kvm, mut vm, guest_memory) = setup_vm_with_memory(mib_to_bytes(128)); + let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); let mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); @@ -842,7 +844,6 @@ pub(crate) mod tests { shutdown_exit_code: None, kvm, vm, - guest_memory, uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 7ced10dbbe5..20b008769a5 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -248,7 +248,7 @@ mod tests { #[test] fn test_register_legacy_devices() { - let (_, vm, _) = setup_vm_with_memory(0x1000); + let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 20d9b381e1c..99bde6e2e78 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -532,7 +532,7 @@ mod tests { use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; use crate::devices::virtio::queue::Queue; - use crate::test_utils::multi_region_mem; + use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -649,10 +649,10 @@ mod tests { fn test_register_virtio_device() { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); - let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); + let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.memory_init(&guest_mem).unwrap(); + vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -666,7 +666,7 @@ mod tests { device_manager .register_virtio_test_device( vm.fd(), - guest_mem, + vm.guest_memory().clone(), &mut resource_allocator, dummy, &mut cmdline, @@ -680,10 +680,10 @@ mod tests { fn test_register_too_many_devices() { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); - let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); + let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.memory_init(&guest_mem).unwrap(); + vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = 
ResourceAllocator::new().unwrap(); @@ -697,7 +697,7 @@ device_manager .register_virtio_test_device( vm.fd(), - guest_mem.clone(), + vm.guest_memory().clone(), &mut resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, @@ -711,7 +711,7 @@ device_manager .register_virtio_test_device( vm.fd(), - guest_mem, + vm.guest_memory().clone(), &mut resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, @@ -736,12 +736,10 @@ fn test_device_info() { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); - let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); + let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm).unwrap(); - vm.memory_init(&guest_mem).unwrap(); - - let mem_clone = guest_mem.clone(); + vm.register_memory_regions(guest_mem).unwrap(); #[cfg(target_arch = "x86_64")] vm.setup_irqchip().unwrap(); @@ -758,7 +756,7 @@ let addr = device_manager .register_virtio_test_device( vm.fd(), - guest_mem, + vm.guest_memory().clone(), &mut resource_allocator, dummy, &mut cmdline, @@ -794,7 +792,7 @@ device_manager .register_virtio_test_device( vm.fd(), - mem_clone, + vm.guest_memory().clone(), &mut resource_allocator, dummy2, &mut cmdline, diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 0b8ad8eb04d..30a6387bc82 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -805,7 +805,7 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { - mem: &vmm.guest_memory, + mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, resource_allocator: &mut resource_allocator, diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index c9907744080..b0bf5a31e3f 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -376,8 +376,9 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG; + use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; - use crate::vstate::memory::{GuestAddress, GuestMemoryExtension}; + use crate::vstate::memory::GuestAddress; #[test] fn test_from_config() { @@ -778,9 +779,7 @@ let file = TempFile::new().unwrap().into_file(); file.set_len(region_size as u64).unwrap(); let regions = vec![(GuestAddress(0x0), region_size)]; - let guest_memory = - GuestMemoryMmap::create(regions.into_iter(), libc::MAP_PRIVATE, Some(file), false) .unwrap(); + let guest_memory = create_mem(file, &regions); // During activation of the device features, memory and queues should be set and activated.
vhost_block.activate(guest_memory).unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index a43ab1e3b4b..79fe7c0c77d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -194,7 +194,8 @@ pub mod tests { use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::utils::u64_to_usize; use crate::vmm_config::machine_config::HugePageConfig; - use crate::vstate::memory::{Bitmap, Bytes, GuestMemory, GuestMemoryExtension}; + use crate::vstate::memory; + use crate::vstate::memory::{Bitmap, Bytes, GuestMemory}; const FILE_LEN: u32 = 1024; // 2 pages of memory should be enough to test read/write ops and also dirty tracking. @@ -230,10 +231,13 @@ } fn create_mem() -> GuestMemoryMmap { - GuestMemoryMmap::anonymous( - [(GuestAddress(0), MEM_LEN)].into_iter(), - true, - HugePageConfig::None, + GuestMemoryMmap::from_regions( + memory::anonymous( + [(GuestAddress(0), MEM_LEN)].into_iter(), + true, + HugePageConfig::None, + ) + .unwrap(), ) .unwrap() } diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 7551921f4b0..7c861352317 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -22,7 +22,7 @@ use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; pub enum PersistError { /// Snapshot state contains invalid queue info. InvalidInput, - /// Could not restore queue. + /// Could not restore queue: {0} QueueConstruction(QueueError), } diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 59efcb1a653..83174fbc4d3 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -459,14 +459,30 @@ impl VhostUserHandleImpl { } #[cfg(test)] -mod tests { +pub(crate) mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + use std::fs::File; + use vmm_sys_util::tempfile::TempFile; use super::*; use crate::test_utils::create_tmp_socket; - use crate::vstate::memory::{GuestAddress, GuestMemoryExtension}; + use crate::vstate::memory; + use crate::vstate::memory::GuestAddress; + + pub(crate) fn create_mem(file: File, regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { + GuestMemoryMmap::from_regions( + memory::create( + regions.iter().copied(), + libc::MAP_PRIVATE, + Some(file), + false, + ) + .unwrap(), + ) + .unwrap() + } #[test] fn test_new() { @@ -763,9 +779,7 @@ (GuestAddress(0x10000), region_size), ]; - let guest_memory = - GuestMemoryMmap::create(regions.into_iter(), libc::MAP_PRIVATE, Some(file), false) - .unwrap(); + let guest_memory = create_mem(file, &regions); vuh.update_mem_table(&guest_memory).unwrap(); @@ -879,9 +893,7 @@ file.set_len(region_size as u64).unwrap(); let regions = vec![(GuestAddress(0x0), region_size)]; - let guest_memory = - GuestMemoryMmap::create(regions.into_iter(), libc::MAP_PRIVATE, Some(file), false) .unwrap(); + let guest_memory = create_mem(file, &regions); let mut queue = Queue::new(69); queue.initialize(&guest_memory).unwrap(); diff --git a/src/vmm/src/gdb/arch/aarch64.rs b/src/vmm/src/gdb/arch/aarch64.rs index 9504a48fcc5..a462d677ff3 100644 --- a/src/vmm/src/gdb/arch/aarch64.rs +++ b/src/vmm/src/gdb/arch/aarch64.rs @@ -63,7 +63,9 @@ const PTE_ADDRESS_MASK: u64 = !0b111u64; /// Read a u64 value from a guest memory address fn read_address(vmm: &Vmm, address: u64) -> Result { let mut buf = [0; 8]; -
vmm.guest_memory.read(&mut buf, GuestAddress(address))?; + vmm.vm + .guest_memory() + .read(&mut buf, GuestAddress(address))?; Ok(u64::from_le_bytes(buf)) } diff --git a/src/vmm/src/gdb/target.rs b/src/vmm/src/gdb/target.rs index 3ff96d0c8b5..666861a0adb 100644 --- a/src/vmm/src/gdb/target.rs +++ b/src/vmm/src/gdb/target.rs @@ -399,7 +399,8 @@ impl MultiThreadBase for FirecrackerTarget { GUEST_PAGE_SIZE - (u64_to_usize(gpa) & (GUEST_PAGE_SIZE - 1)), ); - vmm.guest_memory + vmm.vm + .guest_memory() .read(&mut data[..read_len], GuestAddress(gpa as u64)) .map_err(|e| { error!("Error reading memory {e:?} gpa is {gpa}"); @@ -433,7 +434,8 @@ GUEST_PAGE_SIZE - (u64_to_usize(gpa) & (GUEST_PAGE_SIZE - 1)), ); - vmm.guest_memory + vmm.vm + .guest_memory() .write(&data[..write_len], GuestAddress(gpa)) .map_err(|e| { error!("Error {e:?} writing memory at {gpa:#X}"); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 5f173d3ae9c..29f3b0148ac 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -149,11 +149,8 @@ use crate::logger::{METRICS, MetricsError, error, info, warn}; use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; -use crate::utils::u64_to_usize; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; -use crate::vstate::memory::{ GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, }; +use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; @@ -265,7 +262,7 @@ pub enum VmmError { } /// Shorthand type for KVM dirty page bitmap. -pub type DirtyBitmap = HashMap<usize, Vec<u64>>; +pub type DirtyBitmap = HashMap<u32, Vec<u64>>; /// Returns the size of guest memory, in MiB. pub(crate) fn mem_size_mib(guest_memory: &GuestMemoryMmap) -> u64 { @@ -309,8 +306,8 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, - vm: Vm, - guest_memory: GuestMemoryMmap, + /// VM object + pub vm: Vm, // Save UFFD in order to keep it open in the Firecracker process, as well. uffd: Option<Uffd>, vcpus_handles: Vec<VcpuHandle>, @@ -521,12 +518,10 @@ }; let device_states = self.mmio_device_manager.save(); - let memory_state = self.guest_memory.describe(); let acpi_dev_state = self.acpi_device_manager.save(); Ok(MicrovmState { vm_info: vm_info.clone(), - memory_state, kvm_state, vm_state, vcpu_states, @@ -591,37 +586,6 @@ Ok(cpu_configs) } - /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. - pub fn reset_dirty_bitmap(&self) { - self.guest_memory - .iter() - .enumerate() - .for_each(|(slot, region)| { - let _ = self - .vm - .fd() - .get_dirty_log(u32::try_from(slot).unwrap(), u64_to_usize(region.len())); - }); - } - - /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. - pub fn get_dirty_bitmap(&self) -> Result<DirtyBitmap, VmmError> { - let mut bitmap: DirtyBitmap = HashMap::new(); - self.guest_memory - .iter() - .enumerate() - .try_for_each(|(slot, region)| { - let bitmap_region = self - .vm - .fd() - .get_dirty_log(u32::try_from(slot).unwrap(), u64_to_usize(region.len()))?; - bitmap.insert(slot, bitmap_region); - Ok(()) - }) - .map_err(VmmError::DirtyBitmap)?; - Ok(bitmap) - } - /// Updates the path of the host file backing the emulated block device with id `drive_id`. /// We update the disk image on the device and its virtio configuration.
pub fn update_block_device_path( @@ -736,7 +700,7 @@ pub fn update_balloon_config(&mut self, amount_mib: u32) -> Result<(), BalloonError> { // The balloon cannot have a target size greater than the size of // the guest memory. - if u64::from(amount_mib) > mem_size_mib(&self.guest_memory) { + if u64::from(amount_mib) > mem_size_mib(self.vm.guest_memory()) { return Err(BalloonError::TooManyPagesRequested); } diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 067cb51896b..be2fbc0a040 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -34,16 +34,13 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, MachineConfigUpdate}; -use crate::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType, }; +use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, MemBackendType}; use crate::vstate::kvm::KvmState; -use crate::vstate::memory::{ GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryState, MemoryError, }; +use crate::vstate::memory; +use crate::vstate::memory::{GuestMemoryState, GuestRegionMmap, MemoryError}; use crate::vstate::vcpu::{VcpuSendEventError, VcpuState}; use crate::vstate::vm::VmState; -use crate::{EventManager, Vmm, VmmError, mem_size_mib, vstate}; +use crate::{EventManager, Vmm, vstate}; /// Holds information related to the VM that is not part of VmState. #[derive(Clone, Debug, Default, Deserialize, PartialEq, Eq, Serialize)] @@ -77,8 +74,6 @@ pub struct MicrovmState { /// Miscellaneous VM info. pub vm_info: VmInfo, - /// Memory state. - pub memory_state: GuestMemoryState, /// KVM state. pub kvm_state: KvmState, /// VM KVM state. @@ -139,15 +134,15 @@ #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum CreateSnapshotError { /// Cannot get dirty bitmap: {0} - DirtyBitmap(VmmError), + DirtyBitmap(#[from] vmm_sys_util::errno::Error), /// Cannot write memory file: {0} - Memory(MemoryError), + Memory(#[from] MemoryError), /// Cannot perform {0} on the memory backing file: {1} MemoryBackingFile(&'static str, io::Error), /// Cannot save the microVM state: {0} MicrovmState(MicrovmStateError), /// Cannot serialize the microVM state: {0} - SerializeMicrovmState(crate::snapshot::SnapshotError), + SerializeMicrovmState(#[from] crate::snapshot::SnapshotError), /// Cannot perform {0} on the snapshot backing file: {1} SnapshotBackingFile(&'static str, io::Error), } @@ -167,7 +162,25 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - snapshot_memory_to_file(vmm, &params.mem_file_path, params.snapshot_type)?; + vmm.vm + .snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?; + + // We need to mark queues as dirty again for all activated devices. The reason we + // do it here is that we don't mark pages as dirty during runtime + // for queue objects. + // SAFETY: + // This should never fail as we only mark pages if the device has already been activated, + // and the address validation was already performed on device activation.
+ vmm.mmio_device_manager + .for_each_virtio_device(|_, _, _, dev| { + let d = dev.lock().unwrap(); + if d.is_activated() { + d.mark_queue_memory_dirty(vmm.vm.guest_memory()) + } else { + Ok(()) + } + }) + .unwrap(); Ok(()) } @@ -185,9 +198,7 @@ fn snapshot_state_to_file( .map_err(|err| SnapshotBackingFile("open", err))?; let snapshot = Snapshot::new(SNAPSHOT_VERSION); - snapshot - .save(&mut snapshot_file, microvm_state) - .map_err(SerializeMicrovmState)?; + snapshot.save(&mut snapshot_file, microvm_state)?; snapshot_file .flush() .map_err(|err| SnapshotBackingFile("flush", err))?; @@ -196,96 +207,6 @@ fn snapshot_state_to_file( .map_err(|err| SnapshotBackingFile("sync_all", err)) } -/// Takes a snapshot of the virtual machine running inside the given [`Vmm`] and saves it to -/// `mem_file_path`. -/// -/// If `snapshot_type` is [`SnapshotType::Diff`], and `mem_file_path` exists and is a snapshot file -/// of matching size, then the diff snapshot will be directly merged into the existing snapshot. -/// Otherwise, existing files are simply overwritten. -fn snapshot_memory_to_file( - vmm: &Vmm, - mem_file_path: &Path, - snapshot_type: SnapshotType, -) -> Result<(), CreateSnapshotError> { - use self::CreateSnapshotError::*; - - // Need to check this here, as we create the file in the line below - let file_existed = mem_file_path.exists(); - - let mut file = OpenOptions::new() - .write(true) - .create(true) - .truncate(false) - .open(mem_file_path) - .map_err(|err| MemoryBackingFile("open", err))?; - - // Determine what size our total memory area is. - let mem_size_mib = mem_size_mib(&vmm.guest_memory); - let expected_size = mem_size_mib * 1024 * 1024; - - if file_existed { - let file_size = file - .metadata() - .map_err(|e| MemoryBackingFile("get_metadata", e))? - .len(); - - // Here we only truncate the file if the size mismatches. - // - For full snapshots, the entire file's contents will be overwritten anyway. We have to - // avoid truncating here to deal with the edge case where it represents the snapshot file - // from which this very microVM was loaded (as modifying the memory file would be - // reflected in the mmap of the file, meaning a truncate operation would zero out guest - // memory, and thus corrupt the VM). - // - For diff snapshots, we want to merge the diff layer directly into the file. - if file_size != expected_size { - file.set_len(0) - .map_err(|err| MemoryBackingFile("truncate", err))?; - } - } - - // Set the length of the file to the full size of the memory area. - file.set_len(expected_size) - .map_err(|e| MemoryBackingFile("set_length", e))?; - - match snapshot_type { - SnapshotType::Diff => { - let dirty_bitmap = vmm.get_dirty_bitmap().map_err(DirtyBitmap)?; - vmm.guest_memory - .dump_dirty(&mut file, &dirty_bitmap) - .map_err(Memory) - } - SnapshotType::Full => { - let dump_res = vmm.guest_memory.dump(&mut file).map_err(Memory); - if dump_res.is_ok() { - vmm.reset_dirty_bitmap(); - vmm.guest_memory.reset_dirty(); - } - - dump_res - } - }?; - // We need to mark queues as dirty again for all activated devices. The reason we - // do it here is because we don't mark pages as dirty during runtime - // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. 
-    vmm.mmio_device_manager
-        .for_each_virtio_device(|_, _, _, dev| {
-            let d = dev.lock().unwrap();
-            if d.is_activated() {
-                d.mark_queue_memory_dirty(&vmm.guest_memory)
-            } else {
-                Ok(())
-            }
-        })
-        .unwrap();
-
-    file.flush()
-        .map_err(|err| MemoryBackingFile("flush", err))?;
-    file.sync_all()
-        .map_err(|err| MemoryBackingFile("sync_all", err))
-}
-
 /// Validates that snapshot CPU vendor matches the host CPU vendor.
 ///
 /// # Errors
@@ -369,7 +290,7 @@ pub fn snapshot_state_sanity_check(
     // Check if the snapshot contains at least 1 mem region.
     // Upper bound check will be done when creating guest memory by comparing against
     // KVM max supported value kvm_context.max_memslots().
-    if microvm_state.memory_state.regions.is_empty() {
+    if microvm_state.vm_state.memory.regions.is_empty() {
         return Err(SnapShotStateSanityCheckError::NoMemory);
     }
@@ -452,7 +373,7 @@ pub fn restore_from_snapshot(
     snapshot_state_sanity_check(&microvm_state)?;

     let mem_backend_path = &params.mem_backend.backend_path;
-    let mem_state = &microvm_state.memory_state;
+    let mem_state = &microvm_state.vm_state.memory;

     let (guest_memory, uffd) = match params.mem_backend.backend_type {
         MemBackendType::File => {
@@ -472,9 +393,6 @@ pub fn restore_from_snapshot(
                 mem_backend_path,
                 mem_state,
                 track_dirty_pages,
-                // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device
-                // is present in the microVM state.
-                microvm_state.device_states.balloon_device.is_some(),
                 vm_resources.machine_config.huge_pages,
             )
             .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?,
@@ -533,9 +451,9 @@ fn guest_memory_from_file(
     mem_file_path: &Path,
     mem_state: &GuestMemoryState,
     track_dirty_pages: bool,
-) -> Result<GuestMemoryMmap, GuestMemoryFromFileError> {
+) -> Result<Vec<GuestRegionMmap>, GuestMemoryFromFileError> {
     let mem_file = File::open(mem_file_path)?;
-    let guest_mem = GuestMemoryMmap::snapshot_file(mem_file, mem_state, track_dirty_pages)?;
+    let guest_mem = memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages)?;
     Ok(guest_mem)
 }
@@ -558,19 +476,18 @@ fn guest_memory_from_uffd(
     mem_uds_path: &Path,
     mem_state: &GuestMemoryState,
     track_dirty_pages: bool,
-    enable_balloon: bool,
     huge_pages: HugePageConfig,
-) -> Result<(GuestMemoryMmap, Option<Uffd>), GuestMemoryFromUffdError> {
+) -> Result<(Vec<GuestRegionMmap>, Option<Uffd>), GuestMemoryFromUffdError> {
     let (guest_memory, backend_mappings) =
         create_guest_memory(mem_state, track_dirty_pages, huge_pages)?;

     let mut uffd_builder = UffdBuilder::new();

-    if enable_balloon {
-        // We enable this so that the page fault handler can add logic
-        // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation.
-        uffd_builder.require_features(FeatureFlags::EVENT_REMOVE);
-    }
+    // We only make use of this if balloon devices are present, but we can enable it
+    // unconditionally, because the only place the kernel checks this is in a hook from
+    // madvise, i.e. it doesn't actively change the behavior of UFFD, only passively.
+    // Without balloon devices we never call madvise anyway, so there is no need to put
+    // this into a conditional.
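+    // For illustration, a handler-side sketch (hypothetical, based on the
+    // `userfaultfd` crate's API, and not code in this repository): with the
+    // feature required, the page fault handler observes one extra event variant:
+    //
+    //     match uffd.read_event().expect("read uffd event") {
+    //         Some(Event::Remove { start, end }) => {
+    //             // balloon inflation dropped [start, end); forget cached pages
+    //         }
+    //         Some(Event::Pagefault { .. }) => {
+    //             // serve the fault, e.g. via uffd.copy() or uffd.zeropage()
+    //         }
+    //         _ => {}
+    //     }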
+    uffd_builder.require_features(FeatureFlags::EVENT_REMOVE);

     let uffd = uffd_builder
         .close_on_exec(true)
@@ -593,10 +510,9 @@ fn create_guest_memory(
     mem_state: &GuestMemoryState,
     track_dirty_pages: bool,
     huge_pages: HugePageConfig,
-) -> Result<(GuestMemoryMmap, Vec<GuestRegionUffdMapping>), GuestMemoryFromUffdError> {
-    let guest_memory =
-        GuestMemoryMmap::anonymous(mem_state.regions(), track_dirty_pages, huge_pages)?;
-    let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions());
+) -> Result<(Vec<GuestRegionMmap>, Vec<GuestRegionUffdMapping>), GuestMemoryFromUffdError> {
+    let guest_memory = memory::anonymous(mem_state.regions(), track_dirty_pages, huge_pages)?;
+    let mut backend_mappings = Vec::with_capacity(guest_memory.len());
     let mut offset = 0;
     for mem_region in guest_memory.iter() {
         #[allow(deprecated)]
@@ -753,13 +669,11 @@ mod tests {
         assert!(states.vsock_device.is_some());
         assert!(states.balloon_device.is_some());

-        let memory_state = vmm.guest_memory.describe();
         let vcpu_states = vec![VcpuState::default()];
         #[cfg(target_arch = "aarch64")]
         let mpidrs = construct_kvm_mpidrs(&vcpu_states);
         let microvm_state = MicrovmState {
             device_states: states,
-            memory_state,
             vcpu_states,
             kvm_state: Default::default(),
             vm_info: VmInfo {
diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs
index 837ddd91069..097e2041b55 100644
--- a/src/vmm/src/resources.rs
+++ b/src/vmm/src/resources.rs
@@ -29,7 +29,8 @@ use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError, init_metrics
 use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError};
 use crate::vmm_config::net::*;
 use crate::vmm_config::vsock::*;
-use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap, MemoryError};
+use crate::vstate::memory;
+use crate::vstate::memory::{GuestRegionMmap, MemoryError};

 /// Errors encountered when configuring microVM resources.
 #[derive(Debug, thiserror::Error, displaydoc::Display)]
@@ -440,7 +441,7 @@ impl VmResources {
     ///
     /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise
     /// prefers anonymous memory for performance reasons.
-    pub fn allocate_guest_memory(&self) -> Result<GuestMemoryMmap, MemoryError> {
+    pub fn allocate_guest_memory(&self) -> Result<Vec<GuestRegionMmap>, MemoryError> {
         let vhost_user_device_used = self
             .block
             .devices
@@ -456,16 +457,16 @@ impl VmResources {
         // because that would require running a backend process. If in the future we converge to
         // a single way of backing guest memory for vhost-user and non-vhost-user cases,
         // that would not be worth the effort.
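        // Illustrative note: both branches below produce the same
        // Vec<GuestRegionMmap> layout; they differ only in the mmap backing,
        // i.e. shared mappings of a single memfd (MAP_SHARED) versus private
        // anonymous memory (MAP_PRIVATE | MAP_ANONYMOUS).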
+        let regions =
+            crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib));
         if vhost_user_device_used {
-            GuestMemoryMmap::memfd_backed(
-                self.machine_config.mem_size_mib,
+            memory::memfd_backed(
+                regions.as_ref(),
                 self.machine_config.track_dirty_pages,
                 self.machine_config.huge_pages,
             )
         } else {
-            let regions =
-                crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib));
-            GuestMemoryMmap::anonymous(
+            memory::anonymous(
                 regions.into_iter(),
                 self.machine_config.track_dirty_pages,
                 self.machine_config.huge_pages,
diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs
index 79026ba6de3..7cb16a2a213 100644
--- a/src/vmm/src/test_utils/mod.rs
+++ b/src/vmm/src/test_utils/mod.rs
@@ -15,7 +15,8 @@ use crate::test_utils::mock_resources::{MockBootSourceConfig, MockVmConfig, Mock
 use crate::vmm_config::boot_source::BootSourceConfig;
 use crate::vmm_config::instance_info::InstanceInfo;
 use crate::vmm_config::machine_config::HugePageConfig;
-use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap};
+use crate::vstate::memory;
+use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap};
 use crate::{EventManager, Vmm};

 pub mod mock_resources;
@@ -26,22 +27,42 @@ pub fn single_region_mem(region_size: usize) -> GuestMemoryMmap {
     single_region_mem_at(0, region_size)
 }

+pub fn single_region_mem_raw(region_size: usize) -> Vec<GuestRegionMmap> {
+    single_region_mem_at_raw(0, region_size)
+}
+
 /// Creates a [`GuestMemoryMmap`] with a single region of the given size starting at the given
 /// guest physical address `at` and without dirty tracking.
 pub fn single_region_mem_at(at: u64, size: usize) -> GuestMemoryMmap {
     multi_region_mem(&[(GuestAddress(at), size)])
 }

+pub fn single_region_mem_at_raw(at: u64, size: usize) -> Vec<GuestRegionMmap> {
+    multi_region_mem_raw(&[(GuestAddress(at), size)])
+}
+
 /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking.
 pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap {
-    GuestMemoryMmap::anonymous(regions.iter().copied(), false, HugePageConfig::None)
+    GuestMemoryMmap::from_regions(
+        memory::anonymous(regions.iter().copied(), false, HugePageConfig::None)
+            .expect("Cannot initialize memory"),
+    )
+    .unwrap()
+}
+
+pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec<GuestRegionMmap> {
+    memory::anonymous(regions.iter().copied(), false, HugePageConfig::None)
         .expect("Cannot initialize memory")
 }

 /// Creates a [`GuestMemoryMmap`] of the given size with the contained regions laid out in
 /// accordance with the requirements of the architecture on which the tests are being run.
 pub fn arch_mem(mem_size_bytes: usize) -> GuestMemoryMmap {
-    multi_region_mem(&crate::arch::arch_memory_regions(mem_size_bytes))
+    multi_region_mem(&crate::arch::arch_memory_regions(0, mem_size_bytes))
+}
+
+pub fn arch_mem_raw(mem_size_bytes: usize) -> Vec<GuestRegionMmap> {
+    multi_region_mem_raw(&crate::arch::arch_memory_regions(0, mem_size_bytes))
 }

 pub fn create_vmm(
diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs
index b89d0e1a0d8..c857aa83080 100644
--- a/src/vmm/src/vstate/kvm.rs
+++ b/src/vmm/src/vstate/kvm.rs
@@ -7,7 +7,6 @@ use serde::{Deserialize, Serialize};

 pub use crate::arch::{Kvm, KvmArchError};
 use crate::cpu_config::templates::KvmCapability;
-use crate::vstate::memory::{GuestMemory, GuestMemoryMmap};

 /// Errors associated with the wrappers over KVM ioctls.
 /// Needs `rustfmt::skip` to make multiline comments work
@@ -21,8 +20,6 @@ pub enum KvmError {
     /** Error creating KVM object: {0} Make sure the user launching the firecracker process is \
     configured on the /dev/kvm file's ACL. */
     Kvm(kvm_ioctls::Error),
-    /// The number of configured slots is bigger than the maximum reported by KVM
-    NotEnoughMemorySlots,
     /// Architecture specific error: {0}
     ArchError(#[from] KvmArchError)
 }
@@ -43,18 +40,7 @@ impl Kvm {
         // Check that all desired capabilities are supported.
         Self::check_capabilities(&kvm_fd, &total_caps).map_err(KvmError::Capabilities)?;

-        let max_memslots = kvm_fd.get_nr_memslots();
-
-        Ok(Kvm::init_arch(kvm_fd, max_memslots, kvm_cap_modifiers)?)
-    }
-
-    /// Check guest memory does not have more regions than kvm allows.
-    pub fn check_memory(&self, guest_mem: &GuestMemoryMmap) -> Result<(), KvmError> {
-        if guest_mem.num_regions() > self.max_memslots {
-            Err(KvmError::NotEnoughMemorySlots)
-        } else {
-            Ok(())
-        }
+        Ok(Kvm::init_arch(kvm_fd, kvm_cap_modifiers)?)
     }

     fn combine_capabilities(kvm_cap_modifiers: &[KvmCapability]) -> Vec<u32> {
@@ -92,6 +78,11 @@ impl Kvm {
             kvm_cap_modifiers: self.kvm_cap_modifiers.clone(),
         }
     }
+
+    /// Returns the maximum number of memslots allowed in a [`Vm`]
+    pub fn max_nr_memslots(&self) -> usize {
+        self.fd.get_nr_memslots()
+    }
 }

 /// Structure holding a general VM state.
diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs
index 8875791f3a6..19367f7f997 100644
--- a/src/vmm/src/vstate/memory.rs
+++ b/src/vmm/src/vstate/memory.rs
@@ -9,7 +9,6 @@ use std::fs::File;
 use std::io::SeekFrom;
 use std::sync::Arc;

-use libc::c_int;
 use serde::{Deserialize, Serialize};
 pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice};
 pub use vm_memory::mmap::MmapRegionBuilder;
@@ -22,8 +21,7 @@ use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile};
 use vmm_sys_util::errno;

 use crate::DirtyBitmap;
-use crate::arch::arch_memory_regions;
-use crate::utils::{get_page_size, mib_to_bytes, u64_to_usize};
+use crate::utils::{get_page_size, u64_to_usize};
 use crate::vmm_config::machine_config::HugePageConfig;

 /// Type of GuestMemoryMmap.
@@ -52,65 +50,93 @@ pub enum MemoryError {
     OffsetTooLarge,
 }

+/// Creates a `Vec` of `GuestRegionMmap` with the given configuration
+pub fn create(
+    regions: impl Iterator<Item = (GuestAddress, usize)>,
+    mmap_flags: libc::c_int,
+    file: Option<File>,
+    track_dirty_pages: bool,
+) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+    let mut offset = 0;
+    let file = file.map(Arc::new);
+    regions
+        .map(|(start, size)| {
+            let mut builder = MmapRegionBuilder::new_with_bitmap(
+                size,
+                track_dirty_pages.then(|| AtomicBitmap::with_len(size)),
+            )
+            .with_mmap_prot(libc::PROT_READ | libc::PROT_WRITE)
+            .with_mmap_flags(libc::MAP_NORESERVE | mmap_flags);
+
+            if let Some(ref file) = file {
+                let file_offset = FileOffset::from_arc(Arc::clone(file), offset);
+
+                builder = builder.with_file_offset(file_offset);
+            }
+
+            offset = match offset.checked_add(size as u64) {
+                None => return Err(MemoryError::OffsetTooLarge),
+                Some(new_off) if new_off >= i64::MAX as u64 => {
+                    return Err(MemoryError::OffsetTooLarge);
+                }
+                Some(new_off) => new_off,
+            };
+
+            GuestRegionMmap::new(
+                builder.build().map_err(MemoryError::MmapRegionError)?,
+                start,
+            )
+            .map_err(MemoryError::VmMemoryError)
+        })
+        .collect::<Result<Vec<_>, _>>()
+}
+
+/// Creates a `Vec` of `GuestRegionMmap` covering the given regions, backed by a single memfd.
+pub fn memfd_backed(
+    regions: &[(GuestAddress, usize)],
+    track_dirty_pages: bool,
+    huge_pages: HugePageConfig,
+) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+    let size = regions.iter().map(|&(_, size)| size as u64).sum();
+    let memfd_file = create_memfd(size, huge_pages.into())?.into_file();
+
+    create(
+        regions.iter().copied(),
+        libc::MAP_SHARED | huge_pages.mmap_flags(),
+        Some(memfd_file),
+        track_dirty_pages,
+    )
+}
+
+/// Creates a `Vec` of anonymous `GuestRegionMmap` from raw regions.
+pub fn anonymous(
+    regions: impl Iterator<Item = (GuestAddress, usize)>,
+    track_dirty_pages: bool,
+    huge_pages: HugePageConfig,
+) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+    create(
+        regions,
+        libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | huge_pages.mmap_flags(),
+        None,
+        track_dirty_pages,
+    )
+}
+
+/// Creates a `Vec` of `GuestRegionMmap` given a `file` containing the data
+/// and the `regions` describing the mapping information.
+pub fn snapshot_file(
+    file: File,
+    regions: impl Iterator<Item = (GuestAddress, usize)>,
+    track_dirty_pages: bool,
+) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+    create(regions, libc::MAP_PRIVATE, Some(file), track_dirty_pages)
+}
+
 /// Defines the interface for snapshotting memory.
 pub trait GuestMemoryExtension
 where
     Self: Sized,
 {
-    /// Creates a [`GuestMemoryMmap`] with the given configuration
-    fn create(
-        regions: impl Iterator<Item = (GuestAddress, usize)>,
-        mmap_flags: libc::c_int,
-        file: Option<File>,
-        track_dirty_pages: bool,
-    ) -> Result<Self, MemoryError>;
-
-    /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd.
-    fn memfd_backed(
-        mem_size_mib: usize,
-        track_dirty_pages: bool,
-        huge_pages: HugePageConfig,
-    ) -> Result<Self, MemoryError> {
-        let memfd_file = create_memfd(mem_size_mib, huge_pages.into())?.into_file();
-        let regions = arch_memory_regions(mib_to_bytes(mem_size_mib)).into_iter();
-
-        Self::create(
-            regions,
-            libc::MAP_SHARED | huge_pages.mmap_flags(),
-            Some(memfd_file),
-            track_dirty_pages,
-        )
-    }
-
-    /// Creates a GuestMemoryMmap from raw regions.
-    fn anonymous(
-        regions: impl Iterator<Item = (GuestAddress, usize)>,
-        track_dirty_pages: bool,
-        huge_pages: HugePageConfig,
-    ) -> Result<Self, MemoryError> {
-        Self::create(
-            regions,
-            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | huge_pages.mmap_flags(),
-            None,
-            track_dirty_pages,
-        )
-    }
-
-    /// Creates a GuestMemoryMmap given a `file` containing the data
-    /// and a `state` containing mapping information.
-    fn snapshot_file(
-        file: File,
-        state: &GuestMemoryState,
-        track_dirty_pages: bool,
-    ) -> Result<Self, MemoryError> {
-        Self::create(
-            state.regions(),
-            libc::MAP_PRIVATE,
-            Some(file),
-            track_dirty_pages,
-        )
-    }
-
     /// Describes GuestMemoryMmap through a GuestMemoryState struct.
     fn describe(&self) -> GuestMemoryState;
@@ -163,48 +189,6 @@ impl GuestMemoryState {
 }

 impl GuestMemoryExtension for GuestMemoryMmap {
-    fn create(
-        regions: impl Iterator<Item = (GuestAddress, usize)>,
-        mmap_flags: c_int,
-        file: Option<File>,
-        track_dirty_pages: bool,
-    ) -> Result<Self, MemoryError> {
-        let mut offset = 0;
-        let file = file.map(Arc::new);
-        let regions = regions
-            .map(|(start, size)| {
-                let mut builder = MmapRegionBuilder::new_with_bitmap(
-                    size,
-                    track_dirty_pages.then(|| AtomicBitmap::with_len(size)),
-                )
-                .with_mmap_prot(libc::PROT_READ | libc::PROT_WRITE)
-                .with_mmap_flags(libc::MAP_NORESERVE | mmap_flags);
-
-                if let Some(ref file) = file {
-                    let file_offset = FileOffset::from_arc(Arc::clone(file), offset);
-
-                    builder = builder.with_file_offset(file_offset);
-                }
-
-                offset = match offset.checked_add(size as u64) {
-                    None => return Err(MemoryError::OffsetTooLarge),
-                    Some(new_off) if new_off >= i64::MAX as u64 => {
-                        return Err(MemoryError::OffsetTooLarge);
-                    }
-                    Some(new_off) => new_off,
-                };
-
-                GuestRegionMmap::new(
-                    builder.build().map_err(MemoryError::MmapRegionError)?,
-                    start,
-                )
-                .map_err(MemoryError::VmMemoryError)
-            })
-            .collect::<Result<Vec<_>, _>>()?;
-
-        GuestMemoryMmap::from_regions(regions).map_err(MemoryError::VmMemoryError)
-    }
-
     /// Describes GuestMemoryMmap through a GuestMemoryState struct.
     fn describe(&self) -> GuestMemoryState {
         let mut guest_memory_state = GuestMemoryState::default();
@@ -243,7 +227,7 @@ impl GuestMemoryExtension for GuestMemoryMmap {
         let mut writer_offset = 0;
         let page_size = get_page_size().map_err(MemoryError::PageSize)?;

-        let write_result = self.iter().enumerate().try_for_each(|(slot, region)| {
+        let write_result = self.iter().zip(0..).try_for_each(|(region, slot)| {
             let kvm_bitmap = dirty_bitmap.get(&slot).unwrap();
             let firecracker_bitmap = region.bitmap();
             let mut write_size = 0;
@@ -307,7 +291,7 @@ impl GuestMemoryExtension for GuestMemoryMmap {

     /// Stores the dirty bitmap into the internal bitmap
     fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize) {
-        self.iter().enumerate().for_each(|(slot, region)| {
+        self.iter().zip(0..).for_each(|(region, slot)| {
             let kvm_bitmap = dirty_bitmap.get(&slot).unwrap();
             let firecracker_bitmap = region.bitmap();
@@ -327,10 +311,9 @@ impl GuestMemoryExtension for GuestMemoryMmap {
 }

 fn create_memfd(
-    size: usize,
+    mem_size: u64,
     hugetlb_size: Option<memfd::HugetlbSize>,
 ) -> Result<memfd::Memfd, MemoryError> {
-    let mem_size = mib_to_bytes(size);
     // Create a memfd.
     let opts = memfd::MemfdOptions::default()
         .hugetlb(hugetlb_size)
@@ -340,7 +323,7 @@ fn create_memfd(
     // Resize to guest mem size.
     mem_file
         .as_file()
-        .set_len(mem_size as u64)
+        .set_len(mem_size)
         .map_err(MemoryError::MemfdSetLen)?;

     // Add seals to prevent further resizing.
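For orientation, a minimal usage sketch (not part of the diff itself; it mirrors the test helpers added in test_utils/mod.rs above): with the constructors demoted from `GuestMemoryExtension` methods to free functions, callers now build raw regions first and wrap them explicitly:

    use vm_memory::GuestAddress;

    use crate::vmm_config::machine_config::HugePageConfig;
    use crate::vstate::memory::{self, GuestMemoryMmap};

    fn build_plain_memory(size: usize) -> GuestMemoryMmap {
        // One anonymous region at guest physical address 0, without dirty
        // tracking and without huge pages.
        let regions = memory::anonymous(
            [(GuestAddress(0), size)].into_iter(),
            false,
            HugePageConfig::None,
        )
        .expect("Cannot initialize memory");

        // Wrapping the raw regions into a `GuestMemoryMmap` is now an
        // explicit, separate step.
        GuestMemoryMmap::from_regions(regions).unwrap()
    }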
@@ -368,7 +351,7 @@ mod tests { use super::*; use crate::snapshot::Snapshot; - use crate::utils::get_page_size; + use crate::utils::{get_page_size, mib_to_bytes}; #[test] fn test_anonymous() { @@ -381,7 +364,7 @@ mod tests { (GuestAddress(0x30000), region_size), ]; - let guest_memory = GuestMemoryMmap::anonymous( + let guest_memory = anonymous( regions.into_iter(), dirty_page_tracking, HugePageConfig::None, @@ -403,8 +386,10 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = - GuestMemoryMmap::anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); let dirty_map = [ // page 0: not dirty @@ -459,10 +444,13 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = GuestMemoryMmap::anonymous( - [(GuestAddress(0), region_size)].into_iter(), - false, - HugePageConfig::None, + let guest_memory = GuestMemoryMmap::from_regions( + anonymous( + [(GuestAddress(0), region_size)].into_iter(), + false, + HugePageConfig::None, + ) + .unwrap(), ) .unwrap(); check_serde(&guest_memory); @@ -473,8 +461,10 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = - GuestMemoryMmap::anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); check_serde(&guest_memory); } @@ -487,9 +477,10 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = - GuestMemoryMmap::anonymous(mem_regions.into_iter(), true, HugePageConfig::None) - .unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -512,9 +503,10 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = - GuestMemoryMmap::anonymous(mem_regions.into_iter(), true, HugePageConfig::None) - .unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -545,9 +537,10 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = - GuestMemoryMmap::anonymous(mem_regions.into_iter(), true, HugePageConfig::None) - .unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -569,8 +562,10 @@ mod tests { let mut memory_file = TempFile::new().unwrap().into_file(); guest_memory.dump(&mut memory_file).unwrap(); - let restored_guest_memory = - GuestMemoryMmap::snapshot_file(memory_file, &memory_state, false).unwrap(); + let restored_guest_memory = GuestMemoryMmap::from_regions( + snapshot_file(memory_file, memory_state.regions(), false).unwrap(), + ) + .unwrap(); // Check that the region contents are the same. 
let mut restored_region = vec![0u8; page_size * 2]; @@ -597,9 +592,10 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = - GuestMemoryMmap::anonymous(mem_regions.into_iter(), true, HugePageConfig::None) - .unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -628,8 +624,10 @@ mod tests { guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); // We can restore from this because this is the first dirty dump. - let restored_guest_memory = - GuestMemoryMmap::snapshot_file(file, &memory_state, false).unwrap(); + let restored_guest_memory = GuestMemoryMmap::from_regions( + snapshot_file(file, memory_state.regions(), false).unwrap(), + ) + .unwrap(); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; @@ -685,9 +683,10 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = - GuestMemoryMmap::anonymous(mem_regions.into_iter(), true, HugePageConfig::None) - .unwrap(); + let guest_memory = GuestMemoryMmap::from_regions( + anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), + ) + .unwrap(); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { @@ -712,12 +711,11 @@ mod tests { #[test] fn test_create_memfd() { - let size = 1; - let size_mb = mib_to_bytes(1) as u64; + let size_bytes = mib_to_bytes(1) as u64; - let memfd = create_memfd(size, None).unwrap(); + let memfd = create_memfd(size_bytes, None).unwrap(); - assert_eq!(memfd.as_file().metadata().unwrap().len(), size_mb); + assert_eq!(memfd.as_file().metadata().unwrap().len(), size_bytes); memfd.as_file().set_len(0x69).unwrap_err(); let mut seals = memfd::SealsHashSet::new(); diff --git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index 4e1bf8970a0..825af33eea4 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -784,7 +784,7 @@ pub(crate) mod tests { #[test] fn test_handle_kvm_exit() { - let (_, _, mut vcpu, _vm_mem) = setup_vcpu(0x1000); + let (_, _, mut vcpu) = setup_vcpu(0x1000); let res = handle_kvm_exit(&mut vcpu.kvm_vcpu.peripherals, Ok(VcpuExit::Hlt)); assert_eq!(res.unwrap(), VcpuEmulation::Stopped); @@ -919,8 +919,8 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. #[allow(unused_mut)] - pub(crate) fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, Vcpu, GuestMemoryMmap) { - let (kvm, mut vm, gm) = setup_vm_with_memory(mem_size); + pub(crate) fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, Vcpu) { + let (kvm, mut vm) = setup_vm_with_memory(mem_size); let (mut vcpus, _) = vm.create_vcpus(1).unwrap(); let mut vcpu = vcpus.remove(0); @@ -928,7 +928,7 @@ pub(crate) mod tests { #[cfg(target_arch = "aarch64")] vcpu.kvm_vcpu.init(&[]).unwrap(); - (kvm, vm, vcpu, gm) + (kvm, vm, vcpu) } fn load_good_kernel(vm_memory: &GuestMemoryMmap) -> GuestAddress { @@ -958,17 +958,17 @@ pub(crate) mod tests { entry_addr.kernel_load } - fn vcpu_configured_for_boot() -> (VcpuHandle, EventFd, GuestMemoryMmap) { + fn vcpu_configured_for_boot() -> (Vm, VcpuHandle, EventFd) { Vcpu::register_kick_signal_handler(); // Need enough mem to boot linux. 
let mem_size = mib_to_bytes(64); - let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(mem_size); + let (kvm, vm, mut vcpu) = setup_vcpu(mem_size); let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap(); // Needs a kernel since we'll actually run this vcpu. let entry_point = EntryPoint { - entry_addr: load_good_kernel(&vm_mem), + entry_addr: load_good_kernel(vm.guest_memory()), protocol: BootProtocol::LinuxBoot, }; @@ -977,7 +977,7 @@ pub(crate) mod tests { use crate::cpu_config::x86_64::cpuid::Cpuid; vcpu.kvm_vcpu .configure( - &vm_mem, + vm.guest_memory(), entry_point, &VcpuConfig { vcpu_count: 1, @@ -994,7 +994,7 @@ pub(crate) mod tests { #[cfg(target_arch = "aarch64")] vcpu.kvm_vcpu .configure( - &vm_mem, + vm.guest_memory(), entry_point, &VcpuConfig { vcpu_count: 1, @@ -1013,12 +1013,12 @@ pub(crate) mod tests { // Wait for vCPUs to initialize their TLS before moving forward. barrier.wait(); - (vcpu_handle, vcpu_exit_evt, vm_mem) + (vm, vcpu_handle, vcpu_exit_evt) } #[test] fn test_set_mmio_bus() { - let (_, _, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); vcpu.set_mmio_bus(crate::devices::Bus::new()); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); @@ -1026,7 +1026,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_tls() { - let (_, _, mut vcpu, _mem) = setup_vcpu(0x1000); + let (_, _, mut vcpu) = setup_vcpu(0x1000); // Running on the TLS vcpu should fail before we actually initialize it. unsafe { @@ -1057,7 +1057,7 @@ pub(crate) mod tests { #[test] fn test_invalid_tls() { - let (_, _, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu) = setup_vcpu(0x1000); // Initialize vcpu TLS. vcpu.init_thread_local_data().unwrap(); // Trying to initialize non-empty TLS should error. @@ -1067,7 +1067,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_kick() { Vcpu::register_kick_signal_handler(); - let (_, vm, mut vcpu, _mem) = setup_vcpu(0x1000); + let (_, vm, mut vcpu) = setup_vcpu(0x1000); let mut kvm_run = kvm_ioctls::KvmRunWrapper::mmap_from_fd(&vcpu.kvm_vcpu.fd, vm.fd().run_size()) @@ -1122,7 +1122,7 @@ pub(crate) mod tests { #[test] fn test_immediate_exit_shortcircuits_execution() { - let (_, _, mut vcpu, _mem) = setup_vcpu(0x1000); + let (_, _, mut vcpu) = setup_vcpu(0x1000); vcpu.kvm_vcpu.fd.set_kvm_immediate_exit(1); // Set a dummy value to be returned by the emulate call @@ -1147,7 +1147,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_pause_resume() { - let (vcpu_handle, vcpu_exit_evt, _mem) = vcpu_configured_for_boot(); + let (_vm, vcpu_handle, vcpu_exit_evt) = vcpu_configured_for_boot(); // Queue a Resume event, expect a response. queue_event_expect_response(&vcpu_handle, VcpuEvent::Resume, VcpuResponse::Resumed); @@ -1179,7 +1179,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_save_state_events() { - let (vcpu_handle, _vcpu_exit_evt, _mem) = vcpu_configured_for_boot(); + let (_vm, vcpu_handle, _vcpu_exit_evt) = vcpu_configured_for_boot(); // Queue a Resume event, expect a response. queue_event_expect_response(&vcpu_handle, VcpuEvent::Resume, VcpuResponse::Resumed); @@ -1212,7 +1212,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_dump_cpu_config() { - let (vcpu_handle, _, _mem) = vcpu_configured_for_boot(); + let (_vm, vcpu_handle, _) = vcpu_configured_for_boot(); // Queue a DumpCpuConfig event, expect a DumpedCpuConfig response. 
         vcpu_handle
diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs
index bb13b17ee99..7a8965a4b9a 100644
--- a/src/vmm/src/vstate/vm.rs
+++ b/src/vmm/src/vstate/vm.rs
@@ -5,15 +5,36 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the THIRD-PARTY file.

+use std::collections::HashMap;
+use std::fs::OpenOptions;
+use std::io::Write;
+use std::path::Path;
+use std::sync::Arc;
+
 use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region};
 use kvm_ioctls::VmFd;
 use vmm_sys_util::eventfd::EventFd;

-use crate::Vcpu;
 pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState};
 use crate::logger::info;
-use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
+use crate::persist::CreateSnapshotError;
+use crate::utils::u64_to_usize;
+use crate::vmm_config::snapshot::SnapshotType;
+use crate::vstate::memory::{
+    Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap,
+};
 use crate::vstate::vcpu::VcpuError;
+use crate::{DirtyBitmap, Vcpu, mem_size_mib};
+
+/// Architecture independent parts of a VM.
+#[derive(Debug)]
+pub struct VmCommon {
+    /// The KVM file descriptor used to access this Vm.
+    pub fd: VmFd,
+    max_memslots: usize,
+    /// The guest memory of this Vm.
+    pub guest_memory: GuestMemoryMmap,
+}

 /// Errors associated with the wrappers over KVM ioctls.
 /// Needs `rustfmt::skip` to make multiline comments work
@@ -30,12 +51,16 @@ pub enum VmError {
     EventFd(std::io::Error),
     /// Failed to create vcpu: {0}
     CreateVcpu(VcpuError),
+    /// The number of configured slots is bigger than the maximum reported by KVM
+    NotEnoughMemorySlots,
+    /// Memory Error: {0}
+    VmMemory(#[from] vm_memory::Error),
 }

 /// Contains Vm functions that are usable across CPU architectures
 impl Vm {
     /// Create a KVM VM
-    pub fn create_vm(kvm: &crate::vstate::kvm::Kvm) -> Result<VmFd, VmError> {
+    pub fn create_common(kvm: &crate::vstate::kvm::Kvm) -> Result<VmCommon, VmError> {
         // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines
         // with many VMs.
         //
@@ -57,9 +82,10 @@ impl Vm {
         // retry, they have to start Firecracker from scratch. Doing retries in Firecracker makes
         // recovery faster and improves reliability.
         const MAX_ATTEMPTS: u32 = 5;
-        for attempt in 1..=MAX_ATTEMPTS {
+        let mut attempt = 1;
+        let fd = loop {
             match kvm.fd.create_vm() {
-                Ok(fd) => return Ok(fd),
+                Ok(fd) => break fd,
                 Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => {
                     info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR");
                     // Exponential backoff (1us, 2us, 4us, and 8us => 15us in total)
@@ -67,8 +93,15 @@ impl Vm {
                 }
                 Err(e) => return Err(VmError::CreateVm(e)),
             }
-        }
-        unreachable!();
+
+            attempt += 1;
+        };
+
+        Ok(VmCommon {
+            fd,
+            max_memslots: kvm.max_nr_memslots(),
+            guest_memory: GuestMemoryMmap::default(),
+        })
     }

     /// Creates the specified number of [`Vcpu`]s.
@@ -91,53 +124,170 @@ impl Vm {
         Ok((vcpus, exit_evt))
     }

-    /// Initializes the guest memory.
-    pub fn memory_init(&self, guest_mem: &GuestMemoryMmap) -> Result<(), VmError> {
-        self.set_kvm_memory_regions(guest_mem)
+    /// Register a list of new memory regions to this [`Vm`].
+    pub fn register_memory_regions(
+        &mut self,
+        regions: Vec<GuestRegionMmap>,
+    ) -> Result<(), VmError> {
+        for region in regions {
+            self.register_memory_region(region)?
+        }
+
+        Ok(())
     }

-    pub(crate) fn set_kvm_memory_regions(
-        &self,
-        guest_mem: &GuestMemoryMmap,
-    ) -> Result<(), VmError> {
-        guest_mem
-            .iter()
-            .zip(0u32..)
-            .try_for_each(|(region, slot)| {
-                let flags = if region.bitmap().is_some() {
-                    KVM_MEM_LOG_DIRTY_PAGES
-                } else {
-                    0
-                };
-
-                let memory_region = kvm_userspace_memory_region {
-                    slot,
-                    guest_phys_addr: region.start_addr().raw_value(),
-                    memory_size: region.len(),
-                    userspace_addr: region.as_ptr() as u64,
-                    flags,
-                };
-
-                // SAFETY: Safe because the fd is a valid KVM file descriptor.
-                unsafe { self.fd.set_user_memory_region(memory_region) }
-            })
-            .map_err(VmError::SetUserMemoryRegion)?;
+    /// Register a new memory region to this [`Vm`].
+    pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> {
+        let next_slot = self
+            .guest_memory()
+            .num_regions()
+            .try_into()
+            .map_err(|_| VmError::NotEnoughMemorySlots)?;
+        if next_slot as usize >= self.common.max_memslots {
+            return Err(VmError::NotEnoughMemorySlots);
+        }
+
+        let flags = if region.bitmap().is_some() {
+            KVM_MEM_LOG_DIRTY_PAGES
+        } else {
+            0
+        };
+
+        let memory_region = kvm_userspace_memory_region {
+            slot: next_slot,
+            guest_phys_addr: region.start_addr().raw_value(),
+            memory_size: region.len(),
+            userspace_addr: region.as_ptr() as u64,
+            flags,
+        };
+
+        let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?;
+
+        // SAFETY: Safe because the fd is a valid KVM file descriptor.
+        unsafe {
+            self.fd()
+                .set_user_memory_region(memory_region)
+                .map_err(VmError::SetUserMemoryRegion)?;
+        }
+
+        self.common.guest_memory = new_guest_memory;

         Ok(())
     }

     /// Gets a reference to the kvm file descriptor owned by this VM.
     pub fn fd(&self) -> &VmFd {
-        &self.fd
+        &self.common.fd
+    }
+
+    /// Gets a reference to this [`Vm`]'s [`GuestMemoryMmap`] object
+    pub fn guest_memory(&self) -> &GuestMemoryMmap {
+        &self.common.guest_memory
+    }
+
+    /// Resets the KVM dirty bitmap for each of the guest's memory regions.
+    pub fn reset_dirty_bitmap(&self) {
+        self.guest_memory()
+            .iter()
+            .zip(0u32..)
+            .for_each(|(region, slot)| {
+                let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len()));
+            });
+    }
+
+    /// Retrieves the KVM dirty bitmap for each of the guest's memory regions.
+    pub fn get_dirty_bitmap(&self) -> Result<DirtyBitmap, vmm_sys_util::errno::Error> {
+        let mut bitmap: DirtyBitmap = HashMap::new();
+        self.guest_memory()
+            .iter()
+            .zip(0u32..)
+            .try_for_each(|(region, slot)| {
+                self.fd()
+                    .get_dirty_log(slot, u64_to_usize(region.len()))
+                    .map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region))
+            })?;
+        Ok(bitmap)
+    }
+
+    /// Takes a snapshot of the memory of this [`Vm`] and saves it to `mem_file_path`.
+    ///
+    /// If `snapshot_type` is [`SnapshotType::Diff`], and `mem_file_path` exists and is a snapshot
+    /// file of matching size, then the diff snapshot will be directly merged into the existing
+    /// snapshot. Otherwise, existing files are simply overwritten.
+    pub(crate) fn snapshot_memory_to_file(
+        &self,
+        mem_file_path: &Path,
+        snapshot_type: SnapshotType,
+    ) -> Result<(), CreateSnapshotError> {
+        use self::CreateSnapshotError::*;
+
+        // Need to check this here, as we create the file in the line below
+        let file_existed = mem_file_path.exists();
+
+        let mut file = OpenOptions::new()
+            .write(true)
+            .create(true)
+            .truncate(false)
+            .open(mem_file_path)
+            .map_err(|err| MemoryBackingFile("open", err))?;
+
+        // Determine what size our total memory area is.
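+        // (Worked example: a 128 MiB guest yields expected_size
+        // = 128 * 1024 * 1024 = 134_217_728 bytes below.)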
+ let mem_size_mib = mem_size_mib(self.guest_memory()); + let expected_size = mem_size_mib * 1024 * 1024; + + if file_existed { + let file_size = file + .metadata() + .map_err(|e| MemoryBackingFile("get_metadata", e))? + .len(); + + // Here we only truncate the file if the size mismatches. + // - For full snapshots, the entire file's contents will be overwritten anyway. We have + // to avoid truncating here to deal with the edge case where it represents the + // snapshot file from which this very microVM was loaded (as modifying the memory file + // would be reflected in the mmap of the file, meaning a truncate operation would zero + // out guest memory, and thus corrupt the VM). + // - For diff snapshots, we want to merge the diff layer directly into the file. + if file_size != expected_size { + file.set_len(0) + .map_err(|err| MemoryBackingFile("truncate", err))?; + } + } + + // Set the length of the file to the full size of the memory area. + file.set_len(expected_size) + .map_err(|e| MemoryBackingFile("set_length", e))?; + + match snapshot_type { + SnapshotType::Diff => { + let dirty_bitmap = self.get_dirty_bitmap()?; + self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; + } + SnapshotType::Full => { + self.guest_memory().dump(&mut file)?; + self.reset_dirty_bitmap(); + self.guest_memory().reset_dirty(); + } + }; + + file.flush() + .map_err(|err| MemoryBackingFile("flush", err))?; + file.sync_all() + .map_err(|err| MemoryBackingFile("sync_all", err)) } } #[cfg(test)] pub(crate) mod tests { + use vm_memory::GuestAddress; + use vm_memory::mmap::MmapRegionBuilder; + use super::*; - use crate::test_utils::single_region_mem; + use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; - use crate::vstate::memory::GuestMemoryMmap; + use crate::vstate::memory::GuestRegionMmap; // Auxiliary function being used throughout the tests. pub(crate) fn setup_vm() -> (Kvm, Vm) { @@ -147,11 +297,11 @@ pub(crate) mod tests { } // Auxiliary function being used throughout the tests. - pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm, GuestMemoryMmap) { - let (kvm, vm) = setup_vm(); - let gm = single_region_mem(mem_size); - vm.memory_init(&gm).unwrap(); - (kvm, vm, gm) + pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm) { + let (kvm, mut vm) = setup_vm(); + let gm = single_region_mem_raw(mem_size); + vm.register_memory_regions(gm).unwrap(); + (kvm, vm) } #[test] @@ -162,35 +312,80 @@ pub(crate) mod tests { } #[test] - fn test_vm_memory_init() { - let (_, vm) = setup_vm(); - // Create valid memory region and test that the initialization is successful. - let gm = single_region_mem(0x1000); - vm.memory_init(&gm).unwrap(); - } - - #[test] - fn test_set_kvm_memory_regions() { - let (_, vm) = setup_vm(); - - let gm = single_region_mem(0x1000); - let res = vm.set_kvm_memory_regions(&gm); - res.unwrap(); + fn test_register_memory_regions() { + let (_, mut vm) = setup_vm(); // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. 
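        // (0x10 bytes below is deliberately smaller than a page, so
        // KVM_SET_USER_MEMORY_REGION fails with EINVAL, i.e. "os error 22".)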
- let gm = single_region_mem(0x10); - let res = vm.set_kvm_memory_regions(&gm); + let gm = single_region_mem_raw(0x10); + let res = vm.register_memory_regions(gm); assert_eq!( res.unwrap_err().to_string(), "Cannot set the memory regions: Invalid argument (os error 22)" ); + + let gm = single_region_mem_raw(0x1000); + let res = vm.register_memory_regions(gm); + res.unwrap(); + } + + #[test] + fn test_too_many_regions() { + let (kvm, mut vm) = setup_vm(); + let max_nr_regions = kvm.max_nr_memslots(); + + // SAFETY: valid mmap parameters + let ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + 0x1000, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + -1, + 0, + ) + }; + + assert_ne!(ptr, libc::MAP_FAILED); + + for i in 0..=max_nr_regions { + // SAFETY: we assert above that the ptr is valid, and the size matches what we passed to + // mmap + let region = unsafe { + MmapRegionBuilder::new(0x1000) + .with_raw_mmap_pointer(ptr.cast()) + .build() + .unwrap() + }; + + let region = GuestRegionMmap::new(region, GuestAddress(i as u64 * 0x1000)).unwrap(); + + let res = vm.register_memory_region(region); + + if i >= max_nr_regions { + assert!( + matches!(res, Err(VmError::NotEnoughMemorySlots)), + "{:?} at iteration {} - max_nr_memslots: {}", + res, + i, + max_nr_regions + ); + } else { + res.unwrap_or_else(|_| { + panic!( + "to be able to insert more regions in iteration {i} - max_nr_memslots: \ + {max_nr_regions} - num_regions: {}", + vm.guest_memory().num_regions() + ) + }); + } + } } #[test] fn test_create_vcpus() { let vcpu_count = 2; - let (_, mut vm, _) = setup_vm_with_memory(mib_to_bytes(128)); + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); let (vcpu_vec, _) = vm.create_vcpus(vcpu_count).unwrap(); diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 7fe2c80d35d..55fb07c1aae 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -14,8 +14,6 @@ use vmm::rpc_interface::{ }; use vmm::seccomp::get_empty_filters; use vmm::snapshot::Snapshot; -#[cfg(target_arch = "x86_64")] -use vmm::test_utils::dirty_tracking_vmm; use vmm::test_utils::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; use vmm::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; use vmm::vmm_config::balloon::BalloonDeviceConfig; @@ -112,8 +110,13 @@ fn test_dirty_bitmap_error() { // with errno 2 (ENOENT) because KVM can't find any guest memory regions with dirty // page tracking enabled. assert_eq!( - format!("{:?}", vmm.lock().unwrap().get_dirty_bitmap().err()), - "Some(DirtyBitmap(Error(2)))" + vmm.lock() + .unwrap() + .vm + .get_dirty_bitmap() + .unwrap_err() + .errno(), + 2 ); vmm.lock().unwrap().stop(FcExitCode::Ok); } @@ -122,11 +125,11 @@ fn test_dirty_bitmap_error() { #[cfg(target_arch = "x86_64")] fn test_dirty_bitmap_success() { // The vmm will start with dirty page tracking = ON. - let (vmm, _) = dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE)); + let (vmm, _) = vmm::test_utils::dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE)); // Let it churn for a while and dirty some pages... thread::sleep(Duration::from_millis(100)); - let bitmap = vmm.lock().unwrap().get_dirty_bitmap().unwrap(); + let bitmap = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap(); let num_dirty_pages: u32 = bitmap .values() .map(|bitmap_per_region| { @@ -299,7 +302,7 @@ fn test_snapshot_load_sanity_checks() { snapshot_state_sanity_check(µvm_state).unwrap(); // Remove memory regions. 
- microvm_state.memory_state.regions.clear(); + microvm_state.vm_state.memory.regions.clear(); // Validate sanity checks fail because there is no mem region in state. assert_eq!(
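A closing illustration (a sketch, not part of the diff): the integration test above tallies dirty pages from the value returned by `Vm::get_dirty_bitmap()`. Since `DirtyBitmap` maps each KVM memslot to the `Vec<u64>` words of its dirty log, counting dirty pages reduces to a popcount over all words of all slots:

    use std::collections::HashMap;

    // Assumed alias, as used throughout this patch:
    // KVM memslot index -> dirty-log words.
    type DirtyBitmap = HashMap<u32, Vec<u64>>;

    // Each set bit marks one dirty guest page in the corresponding memslot.
    fn count_dirty_pages(bitmap: &DirtyBitmap) -> u32 {
        bitmap
            .values()
            .flat_map(|words| words.iter())
            .map(|word| word.count_ones())
            .sum()
    }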