Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions src/firecracker/examples/uffd/fault_all_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,27 +90,32 @@ fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) {
// first fault. Note that the kvm-clock page population does not trigger a UFFD
// page fault events since it doesn't use the userspace mappings.
//
// 3 cases are possible:
// 1) We wrote the entire region: this is when neither the kvmclock page nor
// the page that triggered the UFFD event was present in the region. This is
// only possible on x86 with the VMs larger than 3GiB.
// 2) We wrote the region but one page. This is expected to happen on ARM,
// where the only page missing is the one the caused the UFFD event.
// 3) We wrote the region but two pages: one for the kvmclock and one that
// caused the UFFD event. This can only happen on x86.
// 2 cases are possible:
// 1) We wrote the entire region: this is when the kvmclock page was not present
// in the region (for VMs larger than 3GiB due to the MMIO gap).
// 2) We wrote the region but one page, which is the kvmclock page.
assert!(
written == region.size
|| written == region.size - uffd_handler.page_size
|| written == region.size - uffd_handler.page_size * 2
written == region.size || written == region.size - uffd_handler.page_size
);

if written < region.size {
// In the case 2 described above, we use UFFDIO_CONTINUE to set up userspace
// page tables that KVM uses to populate the shared kvmclock page.
_ = uffd_handler
.uffd
.r#continue(fault_addr, uffd_handler.page_size, true)
.inspect_err(|err| println!("Error during uffdio_continue: {:?}", err));
}
}
#[cfg(target_arch = "aarch64")]
{
assert_eq!(written, region.size - uffd_handler.page_size);
}
// We skip the serve_pf() call in case 1) described above
if written < region.size {
uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size);
assert_eq!(written, region.size);
// In the case 2 described above, we use UFFDIO_CONTINUE to set up userspace
// page tables that KVM uses to populate the shared kvmclock page.
_ = uffd_handler
.uffd
.r#continue(fault_addr, uffd_handler.page_size, true)
.inspect_err(|err| println!("Error during uffdio_continue: {:?}", err));
}
}
}
Expand Down
56 changes: 15 additions & 41 deletions src/firecracker/examples/uffd/uffd_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,10 @@ impl UffdHandler {

for region in self.mem_regions.iter() {
if region.contains(fault_page_addr) {
return self.populate_from_file(&region.clone(), fault_page_addr, len);
let offset =
(region.offset + fault_page_addr - region.base_host_virt_addr) as usize;
let src = unsafe { self.backing_buffer.add(offset) };
return self.populate_via_uffdio_copy(src, fault_page_addr, len);
}
}

Expand Down Expand Up @@ -378,11 +381,20 @@ impl UffdHandler {
total_written
}

fn populate_via_uffdio_copy(&self, src: *const u8, dst: u64, len: usize) -> bool {
fn populate_via_uffdio_copy(&mut self, src: *const u8, dst: u64, len: usize) -> bool {
// Calculate offset before the match to avoid borrow checker issues
let offset = self.addr_to_offset(dst as *mut u8) as usize;

unsafe {
match self.uffd.copy(src.cast(), dst as *mut _, len, true) {
// Make sure the UFFD copied some bytes.
Ok(value) => assert!(value > 0),
Ok(value) => {
assert!(value > 0);
// For secret-free VMs, clear the bit in userfault_bitmap after successful UFFDIO_COPY
if let Some(bitmap) = &mut self.userfault_bitmap {
bitmap.reset_addr_range(offset, len);
}
}
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
// queue while we're processing `pagefault` events.
// The weird cast is because the `bytes_copied` field is based on the
Expand All @@ -405,44 +417,6 @@ impl UffdHandler {
true
}

fn populate_via_memcpy(&mut self, src: *const u8, dst: u64, offset: usize, len: usize) -> bool {
let dst_memcpy = unsafe {
self.guest_memfd_addr
.expect("no guest_memfd addr")
.add(offset)
};

unsafe {
std::ptr::copy_nonoverlapping(src, dst_memcpy, len);
}

self.userfault_bitmap
.as_mut()
.unwrap()
.reset_addr_range(offset, len);

self.uffd
.r#continue(dst as _, len, true)
.expect("uffd_continue");

true
}

fn populate_from_file(
&mut self,
region: &GuestRegionUffdMapping,
dst: u64,
len: usize,
) -> bool {
let offset = (region.offset + dst - region.base_host_virt_addr) as usize;
let src = unsafe { self.backing_buffer.add(offset) };

match self.guest_memfd {
Some(_) => self.populate_via_memcpy(src, dst, offset, len),
None => self.populate_via_uffdio_copy(src, dst, len),
}
}

fn zero_out(&mut self, addr: u64) -> bool {
match unsafe { self.uffd.zeropage(addr as *mut _, self.page_size, true) } {
Ok(_) => true,
Expand Down
6 changes: 2 additions & 4 deletions src/vmm/src/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -583,9 +583,6 @@ pub enum GuestMemoryFromUffdError {
HugetlbfsSnapshot,
}

// TODO remove these when the UFFD crate supports minor faults for guest_memfd
const UFFDIO_REGISTER_MODE_MINOR: u64 = 1 << 2;

type GuestMemoryResult =
Result<(Vec<GuestRegionMmap>, Option<Uffd>, Option<UnixStream>), GuestMemoryFromUffdError>;

Expand Down Expand Up @@ -621,7 +618,8 @@ pub fn guest_memory_from_uffd(
let mut fds = vec![uffd.as_raw_fd()];

if let Some(gmem) = guest_memfd_fd {
mode = RegisterMode::from_bits_retain(UFFDIO_REGISTER_MODE_MINOR);
// For secret-free VMs, register with both MISSING and MINOR modes
mode = RegisterMode::MISSING | RegisterMode::MINOR;
fds.push(gmem);
fds.push(
userfault_bitmap_memfd
Expand Down
3 changes: 3 additions & 0 deletions tests/integration_tests/functional/test_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ def test_cpu_topology(uvm_plain_any, num_vcpus, htt):
)


@pytest.mark.skipif(
PLATFORM == "aarch64", reason="Skip until the hange in guest sysfs is addressed"
)
@pytest.mark.parametrize("num_vcpus", [1, 2, 16])
@pytest.mark.parametrize("htt", [True, False], ids=["HTT_ON", "HTT_OFF"])
def test_cache_topology(uvm_plain_any, num_vcpus, htt):
Expand Down