diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 6cc58fefc40..65e4ee5fba2 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1076,6 +1076,11 @@ definitions: description: Aarch64 only. If non-zero, causes Firecracker to allocate a specific memory area of this size (in MiB) which the guest will use as swiotlb region for all I/O. + secret_free: + type: boolean + description: + If enabled, guest memory will be unmapped from the host kernel's address space, providing additional + protection against transient execution issues. All I/O must then go through a swiotlb region. MemoryBackend: type: object diff --git a/src/vmm/benches/memory_access.rs b/src/vmm/benches/memory_access.rs index fe4f138db2d..e3b6b656302 100644 --- a/src/vmm/benches/memory_access.rs +++ b/src/vmm/benches/memory_access.rs @@ -9,7 +9,7 @@ fn bench_single_page_fault(c: &mut Criterion, configuration: VmResources) { c.bench_function("page_fault", |b| { b.iter_batched( || { - let memory = configuration.allocate_guest_memory().unwrap(); + let memory = configuration.allocate_guest_memory(None).unwrap(); // Get a pointer to the first memory region (cannot do `.get_slice(GuestAddress(0), // 1)`, because on ARM64 guest memory does not start at physical // address 0). diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 7df5f83e6a7..8f2247bbd42 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -18,11 +18,11 @@ pub mod vm; use std::cmp::min; use std::fmt::Debug; -use std::fs::File; +use std::io::{Read, Seek}; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, ReadVolatile}; use crate::arch::{BootProtocol, EntryPoint}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; @@ -204,16 +204,10 @@ fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel_file: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. - let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, Some(GuestAddress(get_kernel_start())), diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index 1a3d3d94e17..e7da4b64772 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -8,6 +8,14 @@ use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; use crate::vstate::vm::{VmCommon, VmError}; +/// The VM type for this architecture that allows us to use guest_memfd. On ARM, all VMs +/// support guest_memfd and no special type is needed (in fact, no concept of vm types really +/// exists, and the corresponding field of the CREATE_VM ioctl determines IPA size instead, +/// i.e. the size of the guest physical address space). This value cannot be hardcoded, hence +/// `None` to let the `Vm` constructor know that just the normal [`Kvm::create_vm`] should be called, +/// which internally determines the preferred IPA size. +pub const VM_TYPE_FOR_SECRET_FREEDOM: Option = None; + /// Structure representing the current architecture's understand of what a "virtual machine" is. 
#[derive(Debug)] pub struct ArchVm { @@ -30,8 +38,8 @@ pub enum ArchVmError { impl ArchVm { /// Create a new `Vm` struct. - pub fn new(kvm: &Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &Kvm, vm_type: Option) -> Result { + let common = Self::create_common(kvm, vm_type)?; Ok(ArchVm { common, irqchip_handle: None, diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 0046ad400e5..c74cbc1407d 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -17,7 +17,7 @@ pub use aarch64::kvm::{Kvm, KvmArchError, OptionalCapabilities}; #[cfg(target_arch = "aarch64")] pub use aarch64::vcpu::*; #[cfg(target_arch = "aarch64")] -pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; +pub use aarch64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, @@ -35,7 +35,7 @@ pub use x86_64::kvm::{Kvm, KvmArchError}; #[cfg(target_arch = "x86_64")] pub use x86_64::vcpu::*; #[cfg(target_arch = "x86_64")] -pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; +pub use x86_64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index c982303a2db..a6f68c4c112 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -31,7 +31,7 @@ pub mod xstate; #[allow(missing_docs)] pub mod generated; -use std::fs::File; +use std::io::{Read, Seek}; use layout::CMDLINE_START; use linux_loader::configurator::linux::LinuxBootConfigurator; @@ -44,6 +44,7 @@ use linux_loader::loader::elf::start_info::{ }; use linux_loader::loader::{Cmdline, KernelLoader, PvhBootCapability, load_cmdline}; use log::debug; +use vm_memory::ReadVolatile; use super::EntryPoint; use crate::acpi::create_acpi_tables; @@ -447,20 +448,14 @@ fn add_e820_entry( } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. - let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, None, - &mut kernel_file, + &mut kernel, Some(GuestAddress(get_kernel_start())), ) .map_err(ConfigurationError::KernelLoader)?; diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index b186344cbb2..f9cfeaadaf1 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -5,16 +5,21 @@ use std::fmt; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, + KVM_PIT_SPEAKER_DUMMY, KVM_X86_SW_PROTECTED_VM, MsrList, kvm_clock_data, kvm_irqchip, + kvm_pit_config, kvm_pit_state2, }; use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; +use crate::arch::Kvm; use crate::arch::x86_64::msr::MsrError; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; use crate::vstate::vm::{VmCommon, VmError}; +/// The VM type for this architecture that allows us to use guest_memfd. 
+pub const VM_TYPE_FOR_SECRET_FREEDOM: Option = Some(KVM_X86_SW_PROTECTED_VM as u64); + /// Error type for [`Vm::restore_state`] #[allow(missing_docs)] #[cfg(target_arch = "x86_64")] @@ -60,8 +65,8 @@ pub struct ArchVm { impl ArchVm { /// Create a new `Vm` struct. - pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &Kvm, vm_type: Option) -> Result { + let common = Self::create_common(kvm, vm_type)?; let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index fff9afdacf5..4838ab0eb51 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,6 +5,7 @@ use std::fmt::Debug; use std::io::{self}; +use std::os::unix::fs::MetadataExt; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; @@ -19,7 +20,9 @@ use vm_superio::Rtc; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; -use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; +use crate::arch::{ + ConfigurationError, VM_TYPE_FOR_SECRET_FREEDOM, configure_system_for_boot, load_kernel, +}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -57,12 +60,14 @@ use crate::persist::{ use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; +use crate::utils::u64_to_usize; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vmm_config::snapshot::{MemBackendConfig, MemBackendType}; use crate::vstate::kvm::Kvm; +use crate::vstate::memory::Bounce; use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -139,11 +144,12 @@ fn create_vmm_and_vcpus( event_manager: &mut EventManager, vcpu_count: u8, kvm_capabilities: Vec, + vm_type: Option, ) -> Result<(Vmm, Vec), VmmError> { let kvm = Kvm::new(kvm_capabilities)?; // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; + let mut vm = Vm::new(&kvm, vm_type)?; let resource_allocator = ResourceAllocator::new()?; @@ -214,14 +220,6 @@ pub fn build_microvm_for_boot( .as_ref() .ok_or(MissingKernelConfig)?; - let guest_memory = vm_resources - .allocate_guest_memory() - .map_err(StartMicrovmError::GuestMemory)?; - - let swiotlb = vm_resources - .allocate_swiotlb_region() - .map_err(StartMicrovmError::GuestMemory)?; - // Clone the command-line so that a failed boot doesn't pollute the original. 
#[allow(unused_mut)] let mut boot_cmdline = boot_config.cmdline.clone(); @@ -231,25 +229,69 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; + let secret_free = vm_resources.machine_config.mem_config.secret_free; + let vm_type = match secret_free { + true => VM_TYPE_FOR_SECRET_FREEDOM, + false => None, + }; + let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, vm_resources.machine_config.vcpu_count, cpu_template.kvm_capabilities.clone(), + vm_type, )?; + let guest_memfd = match secret_free { + true => Some( + vmm.vm + .create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP) + .map_err(VmmError::Vm)?, + ), + false => None, + }; + + let guest_memory = vm_resources + .allocate_guest_memory(guest_memfd) + .map_err(StartMicrovmError::GuestMemory)?; + + let swiotlb = vm_resources + .allocate_swiotlb_region() + .map_err(StartMicrovmError::GuestMemory)?; + vmm.vm - .register_memory_regions(guest_memory) + .register_memory_regions(guest_memory, secret_free) .map_err(VmmError::Vm)?; + #[cfg(target_arch = "x86_64")] + vmm.vm.set_memory_private().map_err(VmmError::Vm)?; + if let Some(swiotlb) = swiotlb { vmm.vm .register_swiotlb_region(swiotlb) .map_err(VmmError::Vm)?; } - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel( + Bounce(&boot_config.kernel_file, secret_free), + vmm.vm.guest_memory(), + )?; + let initrd = match &boot_config.initrd_file { + Some(initrd_file) => { + let size = initrd_file + .metadata() + .map_err(InitrdError::Metadata)? + .size(); + + Some(InitrdConfig::from_reader( + vmm.vm.guest_memory(), + Bounce(initrd_file, secret_free), + u64_to_usize(size), + )?) 
+ } + None => None, + }; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -439,6 +481,11 @@ pub fn build_microvm_from_snapshot( event_manager, vm_resources.machine_config.vcpu_count, microvm_state.kvm_state.kvm_cap_modifiers.clone(), + if vm_resources.machine_config.mem_config.secret_free { + VM_TYPE_FOR_SECRET_FREEDOM + } else { + None + }, ) .map_err(StartMicrovmError::Internal)?; @@ -467,8 +514,9 @@ pub fn build_microvm_from_snapshot( guest_memory.iter().map(|r| r.len()).sum(), )?; + // TODO: sort out gmem support for snapshot restore vmm.vm - .register_memory_regions(guest_memory) + .register_memory_regions(guest_memory, false) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 8b2f5b5c387..60c4c74e20d 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -655,8 +655,8 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + let mut vm = Vm::new(&kvm, None).unwrap(); + vm.register_memory_regions(guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -686,8 +686,8 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + let mut vm = Vm::new(&kvm, None).unwrap(); + vm.register_memory_regions(guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -742,8 +742,8 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + let mut vm = Vm::new(&kvm, None).unwrap(); + vm.register_memory_regions(guest_mem, false).unwrap(); #[cfg(target_arch = "x86_64")] vm.setup_irqchip().unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 96e099e7202..c754012baec 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -240,7 +240,7 @@ pub mod tests { ) .unwrap() .into_iter() - .map(|region| KvmRegion::from_mmap_region(region, 0)) + .map(|region| KvmRegion::from_mmap_region(region, 0, None)) .collect(), ) .unwrap() diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 9b21ec7a506..5e028411061 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -482,7 +482,7 @@ pub(crate) mod tests { ) .unwrap() .into_iter() - .map(|region| KvmRegion::from_mmap_region(region, 0)) + .map(|region| KvmRegion::from_mmap_region(region, 0, None)) .collect(), ) .unwrap() diff --git a/src/vmm/src/initrd.rs b/src/vmm/src/initrd.rs index 9dfcd8bc16e..624ec397f73 100644 --- a/src/vmm/src/initrd.rs +++ b/src/vmm/src/initrd.rs @@ -1,14 +1,9 @@ // Copyright 2025 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::fs::File; -use std::os::unix::fs::MetadataExt; - use vm_memory::{GuestAddress, GuestMemory, ReadVolatile, VolatileMemoryError}; use crate::arch::initrd_load_addr; -use crate::utils::u64_to_usize; -use crate::vmm_config::boot_source::BootConfig; use crate::vstate::memory::GuestMemoryMmap; /// Errors associated with initrd loading. @@ -20,8 +15,6 @@ pub enum InitrdError { Load, /// Cannot image metadata: {0} Metadata(std::io::Error), - /// Cannot copy initrd file fd: {0} - CloneFd(std::io::Error), /// Cannot load initrd due to an invalid image: {0} Read(VolatileMemoryError), } @@ -36,31 +29,20 @@ pub struct InitrdConfig { } impl InitrdConfig { - /// Load initrd into guest memory based on the boot config. - pub fn from_config( - boot_cfg: &BootConfig, - vm_memory: &GuestMemoryMmap, - ) -> Result, InitrdError> { - Ok(match &boot_cfg.initrd_file { - Some(f) => { - let f = f.try_clone().map_err(InitrdError::CloneFd)?; - Some(Self::from_file(vm_memory, f)?) - } - None => None, - }) - } - /// Loads the initrd from a file into guest memory. - pub fn from_file(vm_memory: &GuestMemoryMmap, mut file: File) -> Result { - let size = file.metadata().map_err(InitrdError::Metadata)?.size(); - let size = u64_to_usize(size); + pub fn from_reader( + vm_memory: &GuestMemoryMmap, + mut reader: R, + size: usize, + ) -> Result { let Some(address) = initrd_load_addr(vm_memory, size) else { return Err(InitrdError::Address); }; let mut slice = vm_memory .get_slice(GuestAddress(address), size) .map_err(|_| InitrdError::Load)?; - file.read_exact_volatile(&mut slice) + reader + .read_exact_volatile(&mut slice) .map_err(InitrdError::Read)?; Ok(InitrdConfig { @@ -105,7 +87,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let initrd = InitrdConfig::from_file(&gm, tempfile).unwrap(); + let initrd = InitrdConfig::from_reader(&gm, tempfile, image.len()).unwrap(); assert!(gm.address_in_range(initrd.address)); assert_eq!(initrd.size, image.len()); } @@ -120,7 +102,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } @@ -134,7 +116,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } } diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index a5d64cd8b32..e0a677b7c3a 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -330,7 +330,7 @@ pub fn restore_memory( } let mem_file = File::open(path)?; - memory::snapshot_file(mem_file, state.regions(), track_dirty, offset)? + memory::file_private(mem_file, state.regions(), track_dirty, offset)? 
} None => memory::anonymous(state.regions(), track_dirty, huge_pages)?, }; diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 0cfc20f8f5d..51e0fd524d3 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; +use std::fs::File; use std::path::PathBuf; use std::sync::{Arc, Mutex, MutexGuard}; @@ -30,7 +31,7 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; use crate::vmm_config::vsock::*; use crate::vstate::memory; -use crate::vstate::memory::{GuestRegionMmap, MemoryError}; +use crate::vstate::memory::{GuestRegionMmap, MemoryError, create_memfd}; /// Errors encountered when configuring microVM resources. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -110,6 +111,14 @@ pub struct VmResources { } impl VmResources { + /// Whether this [`VmResources`] object contains any devices that require host kernel access + /// into guest memory. + pub fn has_any_io_devices(&self) -> bool { + !self.block.devices.is_empty() + || self.vsock.get().is_some() + || self.net_builder.iter().next().is_some() + } + /// Configures Vmm resources as described by the `config_json` param. pub fn from_json( config_json: &str, @@ -213,7 +222,14 @@ impl VmResources { self.balloon.set_device(balloon); if self.machine_config.huge_pages != HugePageConfig::None { - return Err(ResourcesError::BalloonDevice(BalloonConfigError::HugePages)); + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("huge pages"), + )); + } + if self.machine_config.mem_config.secret_free { + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("secret freedom"), + )); } } @@ -254,8 +270,27 @@ impl VmResources { return Err(MachineConfigError::IncompatibleBalloonSize); } + if self.has_any_io_devices() + && self.machine_config.mem_config.secret_free + && !self.swiotlb_used() + { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "I/O without swiotlb", + )); + } + if self.balloon.get().is_some() && updated.huge_pages != HugePageConfig::None { - return Err(MachineConfigError::BalloonAndHugePages); + return Err(MachineConfigError::Incompatible( + "balloon device", + "huge pages", + )); + } + if self.balloon.get().is_some() && updated.mem_config.secret_free { + return Err(MachineConfigError::Incompatible( + "balloon device", + "secret freedom", + )); } self.machine_config = updated; @@ -312,7 +347,11 @@ impl VmResources { } if self.machine_config.huge_pages != HugePageConfig::None { - return Err(BalloonConfigError::HugePages); + return Err(BalloonConfigError::IncompatibleWith("huge pages")); + } + + if self.machine_config.mem_config.secret_free { + return Err(BalloonConfigError::IncompatibleWith("secret freedom")); } self.balloon.set(config) @@ -338,6 +377,13 @@ impl VmResources { &mut self, block_device_config: BlockDeviceConfig, ) -> Result<(), DriveError> { + if self.has_any_io_devices() + && self.machine_config.mem_config.secret_free + && !self.swiotlb_used() + { + return Err(DriveError::SecretFreeWithoutSwiotlb); + } + self.block.insert(block_device_config) } @@ -346,12 +392,26 @@ impl VmResources { &mut self, body: NetworkInterfaceConfig, ) -> Result<(), NetworkInterfaceError> { + if self.has_any_io_devices() + && self.machine_config.mem_config.secret_free + && !self.swiotlb_used() + { + return Err(NetworkInterfaceError::SecretFreeWithoutSwiotlb); + } + let _ = self.net_builder.build(body)?; Ok(()) } /// Sets a vsock 
device to be attached when the VM starts. pub fn set_vsock_device(&mut self, config: VsockDeviceConfig) -> Result<(), VsockConfigError> { + if self.has_any_io_devices() + && self.machine_config.mem_config.secret_free + && !self.swiotlb_used() + { + return Err(VsockConfigError::SecretFreeWithoutSwiotlb); + } + self.vsock.insert(config) } @@ -473,20 +533,34 @@ impl VmResources { offset: usize, size: usize, vhost_accessible: bool, + file: Option, ) -> Result, MemoryError> { - let regions = crate::arch::arch_memory_regions(offset, size); - if vhost_accessible { - memory::memfd_backed( - regions.as_ref(), - self.machine_config.track_dirty_pages, - self.machine_config.huge_pages, - ) - } else { - memory::anonymous( - regions.into_iter(), + let regions = crate::arch::arch_memory_regions(offset, size).into_iter(); + match file { + Some(file) => memory::file_shared( + file, + regions, self.machine_config.track_dirty_pages, self.machine_config.huge_pages, - ) + ), + None => { + if vhost_accessible { + let memfd = create_memfd(size as u64, self.machine_config.huge_pages.into())? + .into_file(); + memory::file_shared( + memfd, + regions, + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } else { + memory::anonymous( + regions.into_iter(), + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } + } } } @@ -494,7 +568,10 @@ impl VmResources { /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. - pub fn allocate_guest_memory(&self) -> Result, MemoryError> { + pub fn allocate_guest_memory( + &self, + guest_memfd: Option, + ) -> Result, MemoryError> { // Page faults are more expensive for shared memory mapping, including memfd. 
// For this reason, we only back guest memory with a memfd // if a vhost-user-blk device is configured in the VM, otherwise we fall back to @@ -511,6 +588,7 @@ impl VmResources { 0, self.memory_size(), self.vhost_user_devices_used() && !self.swiotlb_used(), + guest_memfd, ) } @@ -523,8 +601,12 @@ impl VmResources { let start = self.memory_size(); let start = start.max(crate::arch::offset_after_last_gap()); - let mut mem = - self.allocate_memory(start, self.swiotlb_size(), self.vhost_user_devices_used())?; + let mut mem = self.allocate_memory( + start, + self.swiotlb_size(), + self.vhost_user_devices_used(), + None, + )?; assert_eq!(mem.len(), 1); @@ -1452,7 +1534,7 @@ mod tests { assert!( matches!( err, - ResourcesError::BalloonDevice(BalloonConfigError::HugePages) + ResourcesError::BalloonDevice(BalloonConfigError::IncompatibleWith("huge pages")) ), "{:?}", err @@ -1595,7 +1677,7 @@ mod tests { ..Default::default() }; - let normal_mem = resources.allocate_guest_memory().unwrap(); + let normal_mem = resources.allocate_guest_memory(None).unwrap(); assert_eq!( normal_mem.iter().map(|r| r.len()).sum::(), mib_to_bytes(16) diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 9bec192fc0e..7f938ec7363 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -48,7 +48,7 @@ pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { memory::anonymous(regions.iter().copied(), false, HugePageConfig::None) .expect("Cannot initialize memory") .into_iter() - .map(|region| KvmRegion::from_mmap_region(region, 0)) + .map(|region| KvmRegion::from_mmap_region(region, 0, None)) .collect(), ) .unwrap() diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index 6ac2fb34ecf..a6fccfe2b4b 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -28,8 +28,8 @@ pub enum BalloonConfigError { CreateFailure(crate::devices::virtio::balloon::BalloonError), /// Error updating the balloon device configuration: {0} UpdateFailure(std::io::Error), - /// Firecracker's huge pages support is incompatible with memory ballooning. - HugePages, + /// Memory ballooning is incompatible with {0}. + IncompatibleWith(&'static str), } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 9e301eff751..4c456c95097 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -24,6 +24,8 @@ pub enum DriveError { DeviceUpdate(VmmError), /// A root block device already exists! RootBlockDeviceAlreadyAdded, + /// A block device was added to a secret free VM that doesnt have a swiotlb region. + SecretFreeWithoutSwiotlb, } /// Use this structure to set up the Block Device before booting the kernel. diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index 80b1ce9ada8..b082e9fc900 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -30,10 +30,8 @@ pub enum MachineConfigError { /// Enabling simultaneous multithreading is not supported on aarch64. #[cfg(target_arch = "aarch64")] SmtNotSupported, - /// Could not determine host kernel version when checking hugetlbfs compatibility - KernelVersion, - /// Firecracker's huge pages support is incompatible with memory ballooning. - BalloonAndHugePages, + /// '{0}' and '{1}' are mutually exclusive and cannot be used together. 
+ Incompatible(&'static str, &'static str) } /// Describes the possible (huge)page configurations for a microVM's memory. @@ -101,6 +99,11 @@ pub struct MemoryConfig { #[cfg(target_arch = "aarch64")] #[serde(default)] pub initial_swiotlb_size: usize, + /// Whether guest_memfd should be used to back normal guest memory. If this is enabled + /// and any devices are attached to the VM, then initial_swiotlb_size must be non-zero, + /// as I/O into secret free memory is not possible. + #[serde(default)] + pub secret_free: bool, } /// Struct used in PUT `/machine-config` API call. @@ -291,6 +294,7 @@ impl MachineConfig { let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib); let page_config = update.huge_pages.unwrap_or(self.huge_pages); let mem_config = update.mem_config.unwrap_or(self.mem_config); + let track_dirty_pages = update.track_dirty_pages.unwrap_or(self.track_dirty_pages); if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) { return Err(MachineConfigError::InvalidMemorySize); @@ -303,6 +307,20 @@ impl MachineConfig { return Err(MachineConfigError::InvalidSwiotlbRegionSize); } + if mem_config.secret_free && page_config != HugePageConfig::None { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages", + )); + } + + if mem_config.secret_free && track_dirty_pages { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots", + )); + } + let cpu_template = match update.cpu_template { None => self.cpu_template.clone(), Some(StaticCpuTemplate::None) => None, @@ -315,7 +333,7 @@ impl MachineConfig { mem_config, smt, cpu_template, - track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages), + track_dirty_pages, huge_pages: page_config, #[cfg(feature = "gdb")] gdb_socket_path: update.gdb_socket_path.clone(), @@ -327,7 +345,7 @@ impl MachineConfig { mod tests { use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; use crate::vmm_config::machine_config::{ - HugePageConfig, MachineConfig, MachineConfigError, MachineConfigUpdate, + HugePageConfig, MachineConfig, MachineConfigError, MachineConfigUpdate, MemoryConfig, }; #[test] @@ -381,6 +399,38 @@ mod tests { .unwrap(); assert_eq!(updated.huge_pages, HugePageConfig::Hugetlbfs2M); assert_eq!(updated.mem_size_mib, 32); + + let res = mconf.update(&MachineConfigUpdate { + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + mem_config: Some(MemoryConfig { + secret_free: true, + ..Default::default() + }), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages" + )) + ); + + let res = mconf.update(&MachineConfigUpdate { + track_dirty_pages: Some(true), + mem_config: Some(MemoryConfig { + secret_free: true, + ..Default::default() + }), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots" + )) + ); } #[test] diff --git a/src/vmm/src/vmm_config/net.rs b/src/vmm/src/vmm_config/net.rs index f1413368090..fd83328109e 100644 --- a/src/vmm/src/vmm_config/net.rs +++ b/src/vmm/src/vmm_config/net.rs @@ -71,6 +71,8 @@ pub enum NetworkInterfaceError { GuestMacAddressInUse(String), /// Cannot open/create the tap device: {0} OpenTap(#[from] TapError), + /// A net device was added to a secret free VM that doesnt have a swiotlb region. + SecretFreeWithoutSwiotlb, } /// Builder for a list of network devices. 
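For readers following the `secret_free` plumbing, here is a condensed, hypothetical sketch of the boot-time sequence that the `builder.rs` changes above wire together on x86_64: pick the software-protected VM type, create a guest_memfd with its direct map removed, keep a userspace mapping of it for the VMM, register it via `KVM_SET_USER_MEMORY_REGION2`, and flip its attributes to private. This is not Firecracker code; it assumes the `kvm-ioctls`, `kvm-bindings` and `libc` crates, a host kernel exposing `KVM_CAP_GUEST_MEMFD`, and guest_memfd mmap support from the in-development direct-map-removal patches. The memory size, slot number, and the locally defined `KVM_GMEM_NO_DIRECT_MAP` constant are illustrative, and error handling, slot bookkeeping and vCPU setup are elided.

```rust
// Hypothetical sketch only: mirrors the flow in builder.rs/vstate::vm, assuming an
// in-development kernel where guest_memfd supports KVM_GMEM_NO_DIRECT_MAP and mmap.
use std::fs::File;
use std::os::fd::{AsRawFd, FromRawFd};

use kvm_bindings::{
    kvm_create_guest_memfd, kvm_memory_attributes, kvm_userspace_memory_region2,
    KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_MEM_GUEST_MEMFD, KVM_X86_SW_PROTECTED_VM,
};
use kvm_ioctls::Kvm;

/// Flag value taken from the in-development kernel patches (mirrors KVM_GMEM_NO_DIRECT_MAP).
const KVM_GMEM_NO_DIRECT_MAP: u64 = 1;
const MEM_SIZE: usize = 128 << 20; // 128 MiB, illustrative

fn main() {
    let kvm = Kvm::new().unwrap();
    // On x86_64 the software-protected VM type is what allows guest_memfd-backed RAM.
    let vm = kvm
        .create_vm_with_type(KVM_X86_SW_PROTECTED_VM as u64)
        .unwrap();

    // Create the guest_memfd with its direct map removed.
    let gmem = vm
        .create_guest_memfd(kvm_create_guest_memfd {
            size: MEM_SIZE as u64,
            flags: KVM_GMEM_NO_DIRECT_MAP,
            ..Default::default()
        })
        .unwrap();
    let gmem = unsafe { File::from_raw_fd(gmem) };

    // The VMM keeps its own shared mapping of the guest_memfd (what `file_shared` does),
    // so it can still load the kernel/initrd and take snapshots from userspace.
    let userspace_addr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM_SIZE,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_SHARED | libc::MAP_NORESERVE,
            gmem.as_raw_fd(),
            0,
        )
    };
    assert_ne!(userspace_addr, libc::MAP_FAILED);

    // Register the region with both the userspace mapping and the guest_memfd backing.
    unsafe {
        vm.set_user_memory_region2(kvm_userspace_memory_region2 {
            slot: 0,
            flags: KVM_MEM_GUEST_MEMFD,
            guest_phys_addr: 0,
            memory_size: MEM_SIZE as u64,
            userspace_addr: userspace_addr as u64,
            guest_memfd: gmem.as_raw_fd() as u32,
            guest_memfd_offset: 0,
            ..Default::default()
        })
        .unwrap();
    }

    // Finally mark the range private so the guest faults its pages in from the guest_memfd.
    vm.set_memory_attributes(kvm_memory_attributes {
        address: 0,
        size: MEM_SIZE as u64,
        attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64,
        ..Default::default()
    })
    .unwrap();
}
```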
diff --git a/src/vmm/src/vmm_config/vsock.rs b/src/vmm/src/vmm_config/vsock.rs index 920e4a4d217..6793bac572e 100644 --- a/src/vmm/src/vmm_config/vsock.rs +++ b/src/vmm/src/vmm_config/vsock.rs @@ -17,6 +17,8 @@ pub enum VsockConfigError { CreateVsockBackend(VsockUnixBackendError), /// Cannot create vsock device: {0} CreateVsockDevice(VsockError), + /// A vsock device was added to a secret free VM that doesn't have a swiotlb region. + SecretFreeWithoutSwiotlb, } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 7c0c31ac344..22c1935512f 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -6,11 +6,13 @@ // found in the THIRD-PARTY file. use std::fs::File; -use std::io::SeekFrom; +use std::io::{Read, Seek, SeekFrom, Write}; use std::mem::ManuallyDrop; +use std::os::fd::{AsFd, AsRawFd}; +use std::ptr::null_mut; use std::sync::Arc; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +use kvm_bindings::{KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region2}; use serde::{Deserialize, Serialize}; pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice}; pub use vm_memory::mmap::MmapRegionBuilder; @@ -20,7 +22,10 @@ pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{Error as VmMemoryError, GuestMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::{ + Error as VmMemoryError, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, + WriteVolatile, +}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -51,12 +56,69 @@ pub enum MemoryError { MemfdSetLen(std::io::Error), /// Total sum of memory regions exceeds largest possible file offset OffsetTooLarge, + /// Error calling mmap: {0} + Mmap(std::io::Error), +} + +/// Newtype that implements [`ReadVolatile`] and [`WriteVolatile`] if `T` implements `Read` or +/// `Write` respectively, by reading/writing using a bounce buffer, and memcpy-ing into the +/// [`VolatileSlice`]. +#[derive(Debug)] +pub struct Bounce(pub T, pub bool); + +// FIXME: replace AsFd with ReadVolatile once &File: ReadVolatile in vm-memory. 
+impl ReadVolatile for Bounce { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result { + if self.1 { + let mut bbuf = vec![0; buf.len()]; + let n = self + .0 + .read(bbuf.as_mut_slice()) + .map_err(VolatileMemoryError::IOError)?; + buf.copy_from(&bbuf[..n]); + Ok(n) + } else { + self.0.as_fd().read_volatile(buf) + } + } +} + +impl WriteVolatile for Bounce { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result { + if self.1 { + let mut bbuf = vec![0; buf.len()]; + buf.copy_to(bbuf.as_mut_slice()); + self.0 + .write(bbuf.as_slice()) + .map_err(VolatileMemoryError::IOError) + } else { + self.0.as_fd().write_volatile(buf) + } + } +} + +impl Read for Bounce { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.0.read(buf) + } +} + +impl Seek for Bounce { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.0.seek(pos) + } } /// A memory region, described in terms of `kvm_userspace_memory_region` #[derive(Debug)] pub struct KvmRegion { - region: kvm_userspace_memory_region, + region: kvm_userspace_memory_region2, bitmap: Option, file_offset: Option, } @@ -70,7 +132,7 @@ impl KvmRegion { /// `kvm_region.userspace_addr as *mut u8` is valid for reads and writes of length /// `kvm_region.memory_size`. pub unsafe fn new( - region: kvm_userspace_memory_region, + region: kvm_userspace_memory_region2, bitmap: Option, file_offset: Option, ) -> Self { @@ -81,13 +143,24 @@ impl KvmRegion { } } - pub(crate) fn from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { + pub(crate) fn from_mmap_region( + region: GuestRegionMmap, + slot: u32, + guest_memfd: Option, + ) -> Self { let region = ManuallyDrop::new(region); - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES - } else { - 0 - }; + let mut flags = 0; + if region.bitmap().is_some() { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + if guest_memfd.is_some() { + flags |= KVM_MEM_GUEST_MEMFD; + } + + #[allow(clippy::cast_sign_loss)] + let (guest_memfd, guest_memfd_offset) = guest_memfd + .map(|fo| (fo.file().as_raw_fd() as u32, fo.start())) + .unwrap_or((0, 0)); // SAFETY: `GuestRegionMmap` is essentially a fat pointer, and ensures that // region.as_ptr() is valid for reads and writes of length region.len(), @@ -95,12 +168,15 @@ impl KvmRegion { // impl won't run and free the memory away from underneath us. 
unsafe { Self::new( - kvm_userspace_memory_region { + kvm_userspace_memory_region2 { slot, flags, guest_phys_addr: region.start_addr().0, memory_size: region.len(), userspace_addr: region.as_ptr() as u64, + guest_memfd, + guest_memfd_offset, + ..Default::default() }, region.bitmap().clone(), region.file_offset().cloned(), @@ -108,7 +184,7 @@ impl KvmRegion { } } - pub(crate) fn inner(&self) -> &kvm_userspace_memory_region { + pub(crate) fn inner(&self) -> &kvm_userspace_memory_region2 { &self.region } } @@ -183,16 +259,40 @@ pub fn create( let mut builder = MmapRegionBuilder::new_with_bitmap( size, track_dirty_pages.then(|| AtomicBitmap::with_len(size)), - ) - .with_mmap_prot(libc::PROT_READ | libc::PROT_WRITE) - .with_mmap_flags(libc::MAP_NORESERVE | mmap_flags); + ); - if let Some(ref file) = file { + // when computing offset below we ensure it fits into i64 + #[allow(clippy::cast_possible_wrap)] + let (fd, fd_off) = if let Some(ref file) = file { let file_offset = FileOffset::from_arc(Arc::clone(file), offset); builder = builder.with_file_offset(file_offset); + + (file.as_raw_fd(), offset as libc::off_t) + } else { + (-1, 0) + }; + + // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense + let ptr = unsafe { + libc::mmap( + null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_NORESERVE | mmap_flags, + fd, + fd_off, + ) + }; + + if ptr == libc::MAP_FAILED { + return Err(MemoryError::Mmap(std::io::Error::last_os_error())); } + // SAFETY: we check above that mmap succeeded, and the size we passed to builder is the + // same as the size of the mmap area. + let builder = unsafe { builder.with_raw_mmap_pointer(ptr.cast()) }; + offset = match offset.checked_add(size as u64) { None => return Err(MemoryError::OffsetTooLarge), Some(new_off) if new_off >= i64::MAX as u64 => { @@ -211,18 +311,16 @@ pub fn create( } /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd. -pub fn memfd_backed( - regions: &[(GuestAddress, usize)], +pub fn file_shared( + file: File, + regions: impl Iterator, track_dirty_pages: bool, huge_pages: HugePageConfig, ) -> Result, MemoryError> { - let size = regions.iter().map(|&(_, size)| size as u64).sum(); - let memfd_file = create_memfd(size, huge_pages.into())?.into_file(); - create( - regions.iter().copied(), + regions, libc::MAP_SHARED | huge_pages.mmap_flags(), - Some(memfd_file), + Some(file), track_dirty_pages, 0, ) @@ -245,7 +343,7 @@ pub fn anonymous( /// Creates a GuestMemoryMmap given a `file` containing the data /// and a `state` containing mapping information. -pub fn snapshot_file( +pub fn file_private( file: File, regions: impl Iterator, track_dirty_pages: bool, @@ -438,7 +536,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { } } -fn create_memfd( +/// Creates a memfd of the given size and huge pages configuration +pub fn create_memfd( mem_size: u64, hugetlb_size: Option, ) -> Result { @@ -475,6 +574,7 @@ mod tests { use std::collections::HashMap; use std::io::{Read, Seek}; + use itertools::Itertools; use vmm_sys_util::tempfile::TempFile; use super::*; @@ -486,7 +586,7 @@ mod tests { regions .into_iter() .zip(0u32..) 
// assign dummy slots - .map(|(region, slot)| KvmRegion::from_mmap_region(region, slot)) + .map(|(region, slot)| KvmRegion::from_mmap_region(region, slot, None)) .collect(), ) .unwrap() @@ -691,7 +791,7 @@ mod tests { guest_memory.dump(&mut memory_file).unwrap(); let restored_guest_memory = - kvmify(snapshot_file(memory_file, memory_state.regions(), false, 0).unwrap()); + kvmify(file_private(memory_file, memory_state.regions(), false, 0).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; page_size * 2]; @@ -749,7 +849,7 @@ mod tests { // We can restore from this because this is the first dirty dump. let restored_guest_memory = - kvmify(snapshot_file(file, memory_state.regions(), false, 0).unwrap()); + kvmify(file_private(file, memory_state.regions(), false, 0).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; @@ -842,4 +942,35 @@ mod tests { seals.insert(memfd::FileSeal::SealGrow); memfd.add_seals(&seals).unwrap_err(); } + + #[test] + fn test_bounce() { + let file_direct = TempFile::new().unwrap(); + let file_bounced = TempFile::new().unwrap(); + + let mut data = (0..=255).collect_vec(); + + Bounce(file_direct.as_file(), false) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + Bounce(file_bounced.as_file(), true) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + + let mut data_direct = vec![0u8; 256]; + let mut data_bounced = vec![0u8; 256]; + + file_direct.as_file().seek(SeekFrom::Start(0)).unwrap(); + file_bounced.as_file().seek(SeekFrom::Start(0)).unwrap(); + + Bounce(file_direct.as_file(), false) + .read_exact_volatile(&mut VolatileSlice::from(data_direct.as_mut_slice())) + .unwrap(); + Bounce(file_bounced.as_file(), true) + .read_exact_volatile(&mut VolatileSlice::from(data_bounced.as_mut_slice())) + .unwrap(); + + assert_eq!(data_direct, data_bounced); + assert_eq!(data_direct, data); + } } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 1f917cfa0c7..dfce69ce5c6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -6,27 +6,35 @@ // found in the THIRD-PARTY file. use std::collections::HashMap; -use std::fs::OpenOptions; +use std::fs::{File, OpenOptions}; use std::io::Write; +use std::os::fd::FromRawFd; use std::path::Path; use std::sync::Arc; -use kvm_ioctls::VmFd; +use kvm_bindings::{ + KVM_MEMORY_ATTRIBUTE_PRIVATE, kvm_create_guest_memfd, kvm_memory_attributes, + kvm_userspace_memory_region, +}; +use kvm_ioctls::{Cap, VmFd}; use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::arch::{Kvm, host_page_size}; use crate::logger::info; use crate::persist::{CreateSnapshotError, GuestRegionUffdMapping}; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ - GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + Bounce, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, KvmRegion, }; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +pub(crate) const KVM_GMEM_NO_DIRECT_MAP: u64 = 1; + /// Architecture independent parts of a VM. 
#[derive(Debug)] pub struct VmCommon { @@ -58,12 +66,18 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// Failure to create guest_memfd: {0} + GuestMemfd(kvm_ioctls::Error), + /// guest_memfd is not supported on this host kernel. + GuestMemfdNotSupported, + /// Failed to set memory attributes to private: {0} + SetMemoryAttributes(kvm_ioctls::Error), } /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Create a KVM VM - pub fn create_common(kvm: &crate::vstate::kvm::Kvm) -> Result { + pub fn create_common(kvm: &Kvm, vm_type: Option) -> Result { // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines // with many VMs. // @@ -87,7 +101,12 @@ impl Vm { const MAX_ATTEMPTS: u32 = 5; let mut attempt = 1; let fd = loop { - match kvm.fd.create_vm() { + let vm_res = match vm_type { + Some(r#type) => kvm.fd.create_vm_with_type(r#type), + None => kvm.fd.create_vm(), + }; + + match vm_res { Ok(fd) => break fd, Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => { info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR"); @@ -128,19 +147,50 @@ impl Vm { Ok((vcpus, exit_evt)) } + /// Create a guest_memfd of the specified size + pub fn create_guest_memfd(&self, size: usize, flags: u64) -> Result { + assert_eq!( + size & (host_page_size() - 1), + 0, + "guest_memfd size must be page aligned" + ); + + if !self.fd().check_extension(Cap::GuestMemfd) { + return Err(VmError::GuestMemfdNotSupported); + } + + let kvm_gmem = kvm_create_guest_memfd { + size: size as u64, + flags, + ..Default::default() + }; + + self.fd() + .create_guest_memfd(kvm_gmem) + .map_err(VmError::GuestMemfd) + // SAFETY: We know rawfd is a valid fd because create_guest_memfd didn't return an + // error. + .map(|rawfd| unsafe { File::from_raw_fd(rawfd) }) + } + /// Register a list of new memory regions to this [`Vm`]. pub fn register_memory_regions( &mut self, regions: Vec, + mmap_of_guest_memfd: bool, ) -> Result<(), VmError> { for region in regions { - self.register_memory_region(region)? + self.register_memory_region(region, mmap_of_guest_memfd)? } Ok(()) } - fn kvmify_region(&self, region: GuestRegionMmap) -> Result { + fn kvmify_region( + &self, + region: GuestRegionMmap, + mmap_of_guest_memfd: bool, + ) -> Result { let next_slot = self .guest_memory() .num_regions() @@ -153,23 +203,59 @@ impl Vm { return Err(VmError::NotEnoughMemorySlots); } - Ok(KvmRegion::from_mmap_region(region, next_slot)) + let gmem_fo = if mmap_of_guest_memfd { + assert!( + region.file_offset().is_some(), + "Requested to register guest_memfd to region that isn't mapping a guest_memfd in \ + the first place!" + ); + + region.file_offset().cloned() + } else { + None + }; + + Ok(KvmRegion::from_mmap_region(region, next_slot, gmem_fo)) } fn register_kvm_region(&mut self, region: &KvmRegion) -> Result<(), VmError> { - // SAFETY: Safe because the fd is a valid KVM file descriptor. - unsafe { - self.fd() - .set_user_memory_region(*region.inner()) - .map_err(VmError::SetUserMemoryRegion)?; + if self.fd().check_extension(Cap::UserMemory2) { + // SAFETY: Safe because the fd is a valid KVM file descriptor. + unsafe { + self.fd() + .set_user_memory_region2(*region.inner()) + .map_err(VmError::SetUserMemoryRegion)?; + } + } else { + // Something is seriously wrong if we manage to set these fields on a host that doesn't + // even allow creation of guest_memfds! 
+ assert_eq!(region.inner().guest_memfd, 0); + assert_eq!(region.inner().guest_memfd_offset, 0); + + // SAFETY: We are passing a valid memory region and operate on a valid KVM FD. + unsafe { + self.fd() + .set_user_memory_region(kvm_userspace_memory_region { + slot: region.inner().slot, + flags: region.inner().flags, + guest_phys_addr: region.inner().guest_phys_addr, + memory_size: region.inner().memory_size, + userspace_addr: region.inner().userspace_addr, + }) + .map_err(VmError::SetUserMemoryRegion)?; + } } Ok(()) } /// Register a new memory region to this [`Vm`]. - pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { - let arcd_region = Arc::new(self.kvmify_region(region)?); + pub fn register_memory_region( + &mut self, + region: GuestRegionMmap, + mmap_of_guest_memfd: bool, + ) -> Result<(), VmError> { + let arcd_region = Arc::new(self.kvmify_region(region, mmap_of_guest_memfd)?); let new_guest_memory = self .common .guest_memory @@ -183,7 +269,9 @@ impl Vm { /// Registers a new io memory region to this [`Vm`]. pub fn register_swiotlb_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { - let arcd_region = Arc::new(self.kvmify_region(region)?); + // swiotlb regions are never gmem backed - by definition they need to be accessible to the + // host! + let arcd_region = Arc::new(self.kvmify_region(region, false)?); let new_collection = self .common .swiotlb_regions @@ -229,6 +317,26 @@ impl Vm { self.common.swiotlb_regions.num_regions() > 0 } + /// Sets the memory attributes on all guest_memfd-backed regions to private + pub fn set_memory_private(&self) -> Result<(), VmError> { + for region in self.guest_memory().iter() { + if region.inner().guest_memfd != 0 { + let attr = kvm_memory_attributes { + address: region.start_addr().0, + size: region.len(), + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + ..Default::default() + }; + + self.fd() + .set_memory_attributes(attr) + .map_err(VmError::SetMemoryAttributes)? + } + } + + Ok(()) + } + /// Returns an iterator over all regions, normal and swiotlb. fn all_regions(&self) -> impl Iterator { self.guest_memory() @@ -353,8 +461,12 @@ impl Vm { .and_then(|_| self.swiotlb_regions().dump_dirty(&mut file, &dirty_bitmap))?; } SnapshotType::Full => { + let secret_hidden = self + .guest_memory() + .iter() + .any(|r| r.inner().guest_memfd != 0); self.guest_memory() - .dump(&mut file) + .dump(&mut Bounce(&file, secret_hidden)) .and_then(|_| self.swiotlb_regions().dump(&mut file))?; self.reset_dirty_bitmap(); self.guest_memory().reset_dirty(); @@ -384,7 +496,7 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. pub(crate) fn setup_vm() -> (Kvm, Vm) { let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let vm = Vm::new(&kvm).expect("Cannot create new vm"); + let vm = Vm::new(&kvm, None).expect("Cannot create new vm"); (kvm, vm) } @@ -392,7 +504,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm) { let (kvm, mut vm) = setup_vm(); let gm = single_region_mem_raw(mem_size); - vm.register_memory_regions(gm).unwrap(); + vm.register_memory_regions(gm, false).unwrap(); (kvm, vm) } @@ -400,7 +512,7 @@ pub(crate) mod tests { fn test_new() { // Testing with a valid /dev/kvm descriptor. 
let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - Vm::new(&kvm).unwrap(); + Vm::new(&kvm, None).unwrap(); } #[test] @@ -410,14 +522,14 @@ pub(crate) mod tests { // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. let gm = single_region_mem_raw(0x10); - let res = vm.register_memory_regions(gm); + let res = vm.register_memory_regions(gm, false); assert_eq!( res.unwrap_err().to_string(), "Cannot set the memory regions: Invalid argument (os error 22)" ); let gm = single_region_mem_raw(0x1000); - let res = vm.register_memory_regions(gm); + let res = vm.register_memory_regions(gm, false); res.unwrap(); } @@ -452,7 +564,7 @@ pub(crate) mod tests { let region = GuestRegionMmap::new(region, GuestAddress(i as u64 * 0x1000)).unwrap(); - let res = vm.register_memory_region(region); + let res = vm.register_memory_region(region, false); if i >= max_nr_regions { assert!( @@ -495,8 +607,8 @@ pub(crate) mod tests { let mut regions = memory::anonymous(regions.into_iter(), false, HugePageConfig::None).unwrap(); - vm.register_memory_region(regions.remove(0)).unwrap(); - vm.register_memory_region(regions.remove(0)).unwrap(); + vm.register_memory_region(regions.remove(0), false).unwrap(); + vm.register_memory_region(regions.remove(0), false).unwrap(); // Before we register any swiotlb regions, io_memory() should return the normal mem region assert_eq!(vm.guest_memory().num_regions(), 2); diff --git a/tests/conftest.py b/tests/conftest.py index 9daeaa42ef3..8dc03828764 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -381,12 +381,20 @@ def io_engine(request): # the network throughput test occasionally fail (both due to connection issues). # The block test passes even with the minimum of 1MiB. We pick 8 to have enough # buffer to the failing cases. -@pytest.fixture(params=[None, 8] if platform.machine() == "aarch64" else [None]) +secret_free_test_cases = [None] +if platform.machine() == "aarch64": + secret_free_test_cases.append((8, False)) + if global_props.instance != "m6g.metal": + secret_free_test_cases.append((8, True)) + + +@pytest.fixture(params=secret_free_test_cases) def memory_config(request): """Differently configured swiotlb regions. Only supported on aarch64""" if request.param is None: return None - return {"initial_swiotlb_size": request.param} + swiotlb_size, secret_free = request.param + return {"initial_swiotlb_size": swiotlb_size, "secret_free": secret_free} @pytest.fixture diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index eb2cd1b608e..09aba9bc968 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -374,9 +374,7 @@ def test_api_machine_config(uvm_plain): bad_size = (1 << 64) - 1 test_microvm.api.machine_config.patch(mem_size_mib=bad_size) - fail_msg = re.escape( - "Invalid Memory Configuration: Cannot create mmap region: Out of memory (os error 12)" - ) + fail_msg = re.escape("Out of memory (os error 12)") with pytest.raises(RuntimeError, match=fail_msg): test_microvm.start() diff --git a/tests/integration_tests/functional/test_secret_freedom.py b/tests/integration_tests/functional/test_secret_freedom.py new file mode 100644 index 00000000000..520cb381203 --- /dev/null +++ b/tests/integration_tests/functional/test_secret_freedom.py @@ -0,0 +1,86 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Test secret-freedom related functionality.""" + +import platform + +import pytest + +from framework import defs +from framework.microvm import Serial +from framework.properties import global_props +from integration_tests.performance.test_initrd import INITRD_FILESYSTEM + +pytestmark = [ + pytest.mark.skipif( + global_props.host_linux_version_metrics != "next", + reason="Secret Freedom is only supported on the in-dev upstream kernels for now", + ), + pytest.mark.skipif( + global_props.instance == "m6g.metal", + reason="Secret Freedom currently only works on ARM hardware conforming to at least ARMv8.4 as the absence of ARM64_HAS_STAGE2_FWB causes kernel panics because of dcache flushing during stage2 page table entry installation", + ), +] + + +@pytest.mark.skipif( + platform.machine() != "aarch64", + reason="only ARM can boot secret free VMs with I/O devices", +) +def test_secret_free_boot(microvm_factory, guest_kernel_linux_6_1, rootfs): + """Tests that a VM can boot if all virtio devices are bound to a swiotlb region, and + that this swiotlb region is actually discovered by the guest.""" + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(memory_config={"initial_swiotlb_size": 8, "secret_free": True}) + vm.add_net_iface() + vm.start() + + +def test_secret_free_initrd(microvm_factory, guest_kernel): + """ + Test that we can boot a secret hidden initrd (i.e. a VM with no I/O devices) + """ + fs = defs.ARTIFACT_DIR / "initramfs.cpio" + uvm = microvm_factory.build(guest_kernel) + uvm.initrd_file = fs + uvm.help.enable_console() + uvm.spawn() + uvm.memory_monitor = None + + uvm.basic_config( + add_root_device=False, + vcpu_count=1, + boot_args="console=ttyS0 reboot=k panic=1 pci=off no-kvmclock", + use_initrd=True, + memory_config={"initial_swiotlb_size": 64, "secret_free": True}, + ) + + uvm.start() + serial = Serial(uvm) + serial.open() + serial.rx(token="# ") + serial.tx("mount |grep rootfs") + serial.rx(token=f"rootfs on / type {INITRD_FILESYSTEM}") + + +@pytest.mark.skipif( + platform.machine() != "aarch64", + reason="only ARM can boot secret free VMs with I/O devices", +) +def test_secret_free_snapshot_creation(microvm_factory, guest_kernel_linux_6_1, rootfs): + """Test that snapshot creation works for secret hidden VMs""" + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(memory_config={"initial_swiotlb_size": 8, "secret_free": True}) + vm.add_net_iface() + vm.start() + + snapshot = vm.snapshot_full() + + # After restoration, the VM will not be secret hidden anymore, as that's not supported yet. + # But we can at least test that in principle, the snapshot creation worked. 
+ vm = microvm_factory.build_from_snapshot(snapshot) + vm.ssh.check_output("true") diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index f035ef38c5a..bd2eb3ba9bc 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -79,7 +79,7 @@ def test_boottime( pytest.skip("swiotlb only supported on aarch64/6.1") for _ in range(10): - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw, monitor_memory=False) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn() vm.basic_config( diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index d683afe065e..f5ddfe23786 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -255,7 +255,7 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) with pytest.raises( RuntimeError, - match="Firecracker's huge pages support is incompatible with memory ballooning.", + match="Memory ballooning is incompatible with huge pages.", ): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) @@ -264,6 +264,6 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) with pytest.raises( RuntimeError, - match="Machine config error: Firecracker's huge pages support is incompatible with memory ballooning.", + match="Machine config error: 'balloon device' and 'huge pages' are mutually exclusive and cannot be used together.", ): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB)
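The `Bounce` wrapper introduced in `vstate/memory.rs` exists because the host kernel must never be handed pointers into a mapping whose pages have no direct map: syscalls such as `read(2)` and `write(2)` would fault on them, so data is staged through ordinary heap memory and copied from userspace. Below is a minimal, self-contained sketch of that bounce-buffer pattern using only the standard library; the function name `bounce_copy`, the 64 KiB staging size, and the plain byte-slice destination (standing in for the `VolatileSlice` Firecracker actually targets) are all illustrative.

```rust
// Hypothetical, std-only sketch of the bounce-buffer idea behind `Bounce` (not the vm-memory
// implementation): read(2) only ever sees ordinary heap memory, and the copy into the guest
// mapping happens as a plain userspace memcpy.
use std::io::{Cursor, Read};

/// Streams `reader` into `dst` through a heap-allocated staging buffer and returns the number
/// of bytes copied.
fn bounce_copy<R: Read>(mut reader: R, dst: &mut [u8]) -> std::io::Result<usize> {
    let mut staging = vec![0u8; 64 * 1024];
    let mut total = 0;
    while total < dst.len() {
        let chunk = staging.len().min(dst.len() - total);
        let n = reader.read(&mut staging[..chunk])?;
        if n == 0 {
            break; // reader exhausted before the destination was full
        }
        // Userspace memcpy into the destination; in Firecracker this targets a VolatileSlice
        // over the guest_memfd mapping.
        dst[total..total + n].copy_from_slice(&staging[..n]);
        total += n;
    }
    Ok(total)
}

fn main() {
    let image: Vec<u8> = (0u8..=255).cycle().take(200_000).collect();
    let mut guest_view = vec![0u8; 200_000]; // stand-in for the guest memory mapping
    let copied = bounce_copy(Cursor::new(&image), &mut guest_view).unwrap();
    assert_eq!(copied, image.len());
    assert_eq!(guest_view, image);
}
```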