diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index e85f7eba871..40a27b0ca43 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -90,27 +90,32 @@ fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) { // first fault. Note that the kvm-clock page population does not trigger a UFFD // page fault events since it doesn't use the userspace mappings. // - // 3 cases are possible: - // 1) We wrote the entire region: this is when neither the kvmclock page nor - // the page that triggered the UFFD event was present in the region. This is - // only possible on x86 with the VMs larger than 3GiB. - // 2) We wrote the region but one page. This is expected to happen on ARM, - // where the only page missing is the one the caused the UFFD event. - // 3) We wrote the region but two pages: one for the kvmclock and one that - // caused the UFFD event. This can only happen on x86. + // 2 cases are possible: + // 1) We wrote the entire region: this is when the kvmclock page was not present + // in the region (for VMs larger than 3GiB due to the MMIO gap). + // 2) We wrote the region but one page, which is the kvmclock page. assert!( - written == region.size - || written == region.size - uffd_handler.page_size - || written == region.size - uffd_handler.page_size * 2 + written == region.size || written == region.size - uffd_handler.page_size ); + + if written < region.size { + // In the case 2 described above, we use UFFDIO_CONTINUE to set up userspace + // page tables that KVM uses to populate the shared kvmclock page. + _ = uffd_handler + .uffd + .r#continue(fault_addr, uffd_handler.page_size, true) + .inspect_err(|err| println!("Error during uffdio_continue: {:?}", err)); + } } #[cfg(target_arch = "aarch64")] { - assert_eq!(written, region.size - uffd_handler.page_size); - } - // We skip the serve_pf() call in case 1) described above - if written < region.size { - uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size); + assert_eq!(written, region.size); + // In the case 2 described above, we use UFFDIO_CONTINUE to set up userspace + // page tables that KVM uses to populate the shared kvmclock page. + _ = uffd_handler + .uffd + .r#continue(fault_addr, uffd_handler.page_size, true) + .inspect_err(|err| println!("Error during uffdio_continue: {:?}", err)); } } } diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 577684521cd..7d3dd8767e7 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -288,7 +288,10 @@ impl UffdHandler { for region in self.mem_regions.iter() { if region.contains(fault_page_addr) { - return self.populate_from_file(®ion.clone(), fault_page_addr, len); + let offset = + (region.offset + fault_page_addr - region.base_host_virt_addr) as usize; + let src = unsafe { self.backing_buffer.add(offset) }; + return self.populate_via_uffdio_copy(src, fault_page_addr, len); } } @@ -378,11 +381,20 @@ impl UffdHandler { total_written } - fn populate_via_uffdio_copy(&self, src: *const u8, dst: u64, len: usize) -> bool { + fn populate_via_uffdio_copy(&mut self, src: *const u8, dst: u64, len: usize) -> bool { + // Calculate offset before the match to avoid borrow checker issues + let offset = self.addr_to_offset(dst as *mut u8) as usize; + unsafe { match self.uffd.copy(src.cast(), dst as *mut _, len, true) { // Make sure the UFFD copied some bytes. - Ok(value) => assert!(value > 0), + Ok(value) => { + assert!(value > 0); + // For secret-free VMs, clear the bit in userfault_bitmap after successful UFFDIO_COPY + if let Some(bitmap) = &mut self.userfault_bitmap { + bitmap.reset_addr_range(offset, len); + } + } // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD // queue while we're processing `pagefault` events. // The weird cast is because the `bytes_copied` field is based on the @@ -405,44 +417,6 @@ impl UffdHandler { true } - fn populate_via_memcpy(&mut self, src: *const u8, dst: u64, offset: usize, len: usize) -> bool { - let dst_memcpy = unsafe { - self.guest_memfd_addr - .expect("no guest_memfd addr") - .add(offset) - }; - - unsafe { - std::ptr::copy_nonoverlapping(src, dst_memcpy, len); - } - - self.userfault_bitmap - .as_mut() - .unwrap() - .reset_addr_range(offset, len); - - self.uffd - .r#continue(dst as _, len, true) - .expect("uffd_continue"); - - true - } - - fn populate_from_file( - &mut self, - region: &GuestRegionUffdMapping, - dst: u64, - len: usize, - ) -> bool { - let offset = (region.offset + dst - region.base_host_virt_addr) as usize; - let src = unsafe { self.backing_buffer.add(offset) }; - - match self.guest_memfd { - Some(_) => self.populate_via_memcpy(src, dst, offset, len), - None => self.populate_via_uffdio_copy(src, dst, len), - } - } - fn zero_out(&mut self, addr: u64) -> bool { match unsafe { self.uffd.zeropage(addr as *mut _, self.page_size, true) } { Ok(_) => true, diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index fd5abcfad29..738695a84fc 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -583,9 +583,6 @@ pub enum GuestMemoryFromUffdError { HugetlbfsSnapshot, } -// TODO remove these when the UFFD crate supports minor faults for guest_memfd -const UFFDIO_REGISTER_MODE_MINOR: u64 = 1 << 2; - type GuestMemoryResult = Result<(Vec, Option, Option), GuestMemoryFromUffdError>; @@ -621,7 +618,8 @@ pub fn guest_memory_from_uffd( let mut fds = vec![uffd.as_raw_fd()]; if let Some(gmem) = guest_memfd_fd { - mode = RegisterMode::from_bits_retain(UFFDIO_REGISTER_MODE_MINOR); + // For secret-free VMs, register with both MISSING and MINOR modes + mode = RegisterMode::MISSING | RegisterMode::MINOR; fds.push(gmem); fds.push( userfault_bitmap_memfd diff --git a/tests/integration_tests/functional/test_topology.py b/tests/integration_tests/functional/test_topology.py index 8a57a349e1d..b5f690ad39b 100644 --- a/tests/integration_tests/functional/test_topology.py +++ b/tests/integration_tests/functional/test_topology.py @@ -217,6 +217,9 @@ def test_cpu_topology(uvm_plain_any, num_vcpus, htt): ) +@pytest.mark.skipif( + PLATFORM == "aarch64", reason="Skip until the hange in guest sysfs is addressed" +) @pytest.mark.parametrize("num_vcpus", [1, 2, 16]) @pytest.mark.parametrize("htt", [True, False], ids=["HTT_ON", "HTT_OFF"]) def test_cache_topology(uvm_plain_any, num_vcpus, htt):