diff --git a/CHANGELOG.md b/CHANGELOG.md index d6a8e1b2ded..30f5b70330c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,10 @@ and this project adheres to - [#5007](https://github.com/firecracker-microvm/firecracker/pull/5007): Fixed watchdog softlockup warning on x86_64 guests when a vCPU is paused during GDB debugging. +- [#5021](https://github.com/firecracker-microvm/firecracker/pull/5021) If a + balloon device is inflated post UFFD-backed snapshot restore, Firecracker now + causes `remove` UFFD messages to be sent to the UFFD handler. Previously, no + such message would be sent. ## [1.10.1] diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index 5e9f49a3207..cfeaa099236 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -36,7 +36,7 @@ fn main() { userfaultfd::Event::Pagefault { .. } => { for region in uffd_handler.mem_regions.clone() { uffd_handler - .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size) + .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size); } } _ => panic!("Unexpected event on userfaultfd"), diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 8cc70ab7c21..a2f7879f591 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -116,7 +116,7 @@ impl UffdHandler { } } - pub fn serve_pf(&mut self, addr: *mut u8, len: usize) { + pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool { // Find the start of the page that the current faulting address belongs to. let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; let fault_page_addr = dst as u64; @@ -133,14 +133,18 @@ impl UffdHandler { // event was received. This can be a consequence of guest reclaiming back its // memory from the host (through balloon device) Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { - let (start, end) = self.populate_from_file(region, fault_page_addr, len); - self.update_mem_state_mappings(start, end, MemPageState::FromFile); - return; + match self.populate_from_file(region, fault_page_addr, len) { + Some((start, end)) => { + self.update_mem_state_mappings(start, end, MemPageState::FromFile) + } + None => return false, + } + return true; } Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { let (start, end) = self.zero_out(fault_page_addr); self.update_mem_state_mappings(start, end, MemPageState::Anonymous); - return; + return true; } None => {} } @@ -152,20 +156,39 @@ impl UffdHandler { ); } - fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) { + fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> { let offset = dst - region.mapping.base_host_virt_addr; let src = self.backing_buffer as u64 + region.mapping.offset + offset; let ret = unsafe { - self.uffd - .copy(src as *const _, dst as *mut _, len, true) - .expect("Uffd copy failed") + match self.uffd.copy(src as *const _, dst as *mut _, len, true) { + Ok(value) => value, + // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD + // queue while we're processing `pagefault` events. + // The weird cast is because the `bytes_copied` field is based on the + // `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something + // goes wrong, it gets set to a -errno code. However, uffd-rs always casts this + // value to an unsigned `usize`, which scrambled the errno. + Err(Error::PartiallyCopied(bytes_copied)) + if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize => + { + return None + } + Err(Error::CopyFailed(errno)) + if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST => + { + len + } + Err(e) => { + panic!("Uffd copy failed: {e:?}"); + } + } }; // Make sure the UFFD copied some bytes. assert!(ret > 0); - (dst, dst + len as u64) + Some((dst, dst + len as u64)) } fn zero_out(&mut self, addr: u64) -> (u64, u64) { diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_handler.rs index 6c681d932ac..936b9f517a3 100644 --- a/src/firecracker/examples/uffd/valid_handler.rs +++ b/src/firecracker/examples/uffd/valid_handler.rs @@ -26,24 +26,79 @@ fn main() { let mut runtime = Runtime::new(stream, file); runtime.install_panic_hook(); runtime.run(|uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. - let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - // We expect to receive either a Page Fault or Removed - // event (if the balloon device is enabled). - match event { - userfaultfd::Event::Pagefault { addr, .. } => { - uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) + // !DISCLAIMER! + // When using UFFD together with the balloon device, this handler needs to deal with + // `remove` and `pagefault` events. There are multiple things to keep in mind in + // such setups: + // + // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN + // ----------------------------------------------------------------------------------- + // + // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event + // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the + // UFFD, and then go back to the process the pre-fetched events. + // + // UFFD might receive events in not in their causal order + // ----------------------------------------------------- + // + // For example, the guest + // kernel might first respond to a balloon inflation by freeing some memory, and + // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the + // free memory range, which causes a `remove` event to be sent to UFFD. Then, the + // guest kernel might immediately fault the page in again (for example because + // default_on_oom was set). which causes a `pagefault` event to be sent to UFFD. + // + // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the + // balloon device is handled by Firecracker on its VMM thread. This means that potentially + // this handler can receive the `pagefault` _before_ the `remove` event. + // + // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events + // to make sure no `remove` event is blocking us can result in the handler acting on + // the `pagefault` event before the `remove` message (despite the `remove` event being + // in the causal past of the `pagefault` event), which means that we will fault in a page + // from the snapshot file, while really we should be faulting in a zero page. + // + // In this example handler, we ignore this problem, to avoid + // complexity (under the assumption that the guest kernel will zero a newly faulted in + // page anyway). A production handler will most likely want to ensure that `remove` + // events for a specific range are always handled before `pagefault` events. + // + // Lastly, we still need to deal with the race condition where a `remove` event arrives + // in the UFFD queue after we got done reading all events, in which case we need to go + // back to reading more events before we can continue processing `pagefault`s. + let mut deferred_events = Vec::new(); + + loop { + // First, try events that we couldn't handle last round + let mut events_to_handle = Vec::from_iter(deferred_events.drain(..)); + + // Read all events from the userfaultfd. + while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") { + events_to_handle.push(event); + } + + for event in events_to_handle.drain(..) { + // We expect to receive either a Page Fault or `remove` + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. } => { + if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { + deferred_events.push(event); + } + } + userfaultfd::Event::Remove { start, end } => uffd_handler + .update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed), + _ => panic!("Unexpected event on userfaultfd"), + } + } + + // We assume that really only the above removed/pagefault interaction can result in + // deferred events. In that scenario, the loop will always terminate (unless + // newly arriving `remove` events end up indefinitely blocking it, but there's nothing + // we can do about that, and it's a largely theoretical problem). + if deferred_events.is_empty() { + break; } - userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( - start as u64, - end as u64, - MemPageState::Removed, - ), - _ => panic!("Unexpected event on userfaultfd"), } }); } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4b43e67541f..fc23d8add0b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -531,6 +531,7 @@ pub fn build_microvm_from_snapshot( resource_allocator: &mut vmm.resource_allocator, vm_resources, instance_id: &instance_info.id, + restored_from_file: vmm.uffd.is_none(), }; vmm.mmio_device_manager = diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5773fa0ba09..bdf63409d68 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -214,6 +214,7 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub resource_allocator: &'a mut ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, + pub restored_from_file: bool, } impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -512,7 +513,10 @@ impl<'a> Persist<'a> for MMIODeviceManager { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new(Balloon::restore( - BalloonConstructorArgs { mem: mem.clone() }, + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, &balloon_state.device_state, )?)); @@ -807,6 +811,7 @@ mod tests { resource_allocator: &mut resource_allocator, vm_resources, instance_id: "microvm-id", + restored_from_file: true, }; let restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_states).unwrap(); diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 697928ae9c6..f6be2536de5 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -164,7 +164,7 @@ pub struct Balloon { pub(crate) irq_trigger: IrqTrigger, // Implementation specific fields. - pub(crate) restored: bool, + pub(crate) restored_from_file: bool, pub(crate) stats_polling_interval_s: u16, pub(crate) stats_timer: TimerFd, // The index of the previous stats descriptor is saved because @@ -189,7 +189,7 @@ impl fmt::Debug for Balloon { .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) .field("irq_trigger", &self.irq_trigger) - .field("restored", &self.restored) + .field("restored_from_file", &self.restored_from_file) .field("stats_polling_interval_s", &self.stats_polling_interval_s) .field("stats_desc_index", &self.stats_desc_index) .field("latest_stats", &self.latest_stats) @@ -204,7 +204,7 @@ impl Balloon { amount_mib: u32, deflate_on_oom: bool, stats_polling_interval_s: u16, - restored: bool, + restored_from_file: bool, ) -> Result { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; @@ -245,7 +245,7 @@ impl Balloon { irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, - restored, + restored_from_file, stats_polling_interval_s, stats_timer, stats_desc_index: None, @@ -355,7 +355,7 @@ impl Balloon { if let Err(err) = remove_range( mem, (guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT), - self.restored, + self.restored_from_file, ) { error!("Error removing memory range: {:?}", err); } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 4e768ddd2e2..c1fb6865b5f 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -95,6 +95,7 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + pub restored_from_file: bool, } impl Persist<'_> for Balloon { @@ -121,7 +122,12 @@ impl Persist<'_> for Balloon { ) -> Result { // We can safely create the balloon with arbitrary flags and // num_pages because we will overwrite them after. - let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s, true)?; + let mut balloon = Balloon::new( + 0, + false, + state.stats_polling_interval_s, + constructor_args.restored_from_file, + )?; let mut num_queues = BALLOON_NUM_QUEUES; // As per the virtio 1.1 specification, the statistics queue @@ -192,13 +198,16 @@ mod tests { // Deserialize and restore the balloon device. let restored_balloon = Balloon::restore( - BalloonConstructorArgs { mem: guest_mem }, + BalloonConstructorArgs { + mem: guest_mem, + restored_from_file: true, + }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); assert_eq!(restored_balloon.device_type(), TYPE_BALLOON); - assert!(restored_balloon.restored); + assert!(restored_balloon.restored_from_file); assert_eq!(restored_balloon.acked_features, balloon.acked_features); assert_eq!(restored_balloon.avail_features, balloon.avail_features); diff --git a/src/vmm/src/devices/virtio/balloon/util.rs b/src/vmm/src/devices/virtio/balloon/util.rs index f8cf7aa2000..a9960540a60 100644 --- a/src/vmm/src/devices/virtio/balloon/util.rs +++ b/src/vmm/src/devices/virtio/balloon/util.rs @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + restored_from_file: bool, ) -> Result<(), RemoveRegionError> { let (guest_address, range_len) = range; @@ -83,7 +83,7 @@ pub(crate) fn remove_range( // Mmap a new anonymous region over the present one in order to create a hole. // This workaround is (only) needed after resuming from a snapshot because the guest memory // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { + if restored_from_file { // SAFETY: The address and length are known to be valid. let ret = unsafe { libc::mmap( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 6833a3a12d2..618f5d7b6c3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -314,8 +314,6 @@ pub struct Vmm { vm: Vm, guest_memory: GuestMemoryMmap, // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index fbb86c4c987..398529de77d 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -118,10 +118,14 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): # Inflate balloon. vm.api.balloon.patch(amount_mib=200) + # Verify if the restored guest works. + vm.ssh.check_output("true") + # Deflate balloon. vm.api.balloon.patch(amount_mib=0) # Verify if the restored guest works. + vm.ssh.check_output("true") def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths):