Skip to content

Commit fef05f7

Browse files
committed
test(uffd_utils): add handling for FaultRequest in secret freedom
There are two ways a UFFD handler receives a fault notification if Secret Freedom is enabled (which is inferred from 3 fds sent by Firecracker instead of 1): - a VMM- or KVM-triggered fault is delivered via a minor UFFD fault event. The handler is supposed to respond to it via memcpying the content of the page (if the page hasn't already been populated) followed by a UFFDIO_CONTINUE call. - a vCPU-triggered fault is delivered via a FaultRequest message on the UDS socket. The handler is supposed to reply with a pwrite64 call on the guest_memfd to populate the page followed by a FaultReply message on the UDS socket. In both cases, the handler also needs to clear the bit in the userfault bitmap at the corresponding offset in order to stop further fault notifications for the same page. UFFD handlers use the userfault bitmap for two purposes: - communicate to the kernel whether a fault at the corresponding guest_memfd offset will cause a VM exit - keep track of pages that have already been populated in order to avoid overwriting the content of the page that is already initialised. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent b9429cb commit fef05f7

File tree

3 files changed

+250
-19
lines changed

3 files changed

+250
-19
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@
88
mod uffd_utils;
99

1010
use std::fs::File;
11+
use std::os::fd::AsRawFd;
1112
use std::os::unix::net::UnixListener;
1213

1314
use uffd_utils::{Runtime, UffdHandler};
1415
use utils::time::{ClockType, get_time_us};
1516

17+
use crate::uffd_utils::uffd_continue;
18+
1619
fn main() {
1720
let mut args = std::env::args();
1821
let uffd_sock_path = args.nth(1).expect("No socket path given");
@@ -37,19 +40,69 @@ fn main() {
3740
.expect("Failed to read uffd_msg")
3841
.expect("uffd_msg not ready");
3942

40-
match event {
41-
userfaultfd::Event::Pagefault { .. } => {
42-
let start = get_time_us(ClockType::Monotonic);
43-
for region in uffd_handler.mem_regions.clone() {
44-
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
45-
}
46-
let end = get_time_us(ClockType::Monotonic);
43+
if let userfaultfd::Event::Pagefault { addr, .. } = event {
44+
let bit =
45+
uffd_handler.addr_to_offset(addr.cast()) as usize / uffd_handler.page_size;
46+
47+
// If Secret Free, we know if this is the first fault based on the userfault
48+
// bitmap state. Otherwise, we assume that we will ever only receive a single fault
49+
// event via UFFD.
50+
let are_we_faulted_yet = uffd_handler
51+
.userfault_bitmap
52+
.as_mut()
53+
.map_or(false, |bitmap| !bitmap.is_bit_set(bit));
4754

48-
println!("Finished Faulting All: {}us", end - start);
55+
if are_we_faulted_yet {
56+
// TODO: we currently ignore the result as we may attempt to
57+
// populate the page that is already present as we may receive
58+
// multiple minor fault events per page.
59+
let _ = uffd_continue(
60+
uffd_handler.uffd.as_raw_fd(),
61+
addr as _,
62+
uffd_handler.page_size as u64,
63+
)
64+
.inspect_err(|err| println!("Error during uffdio_continue: {:?}", err));
65+
} else {
66+
fault_all(uffd_handler, addr);
4967
}
50-
_ => panic!("Unexpected event on userfaultfd"),
5168
}
5269
},
5370
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
5471
);
5572
}
73+
74+
fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) {
75+
let start = get_time_us(ClockType::Monotonic);
76+
for region in uffd_handler.mem_regions.clone() {
77+
match uffd_handler.guest_memfd {
78+
None => {
79+
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
80+
}
81+
Some(_) => {
82+
let written = uffd_handler.populate_via_write(region.offset as usize, region.size);
83+
84+
// This code is written under the assumption that the first fault triggered by
85+
// Firecracker is either due to an MSR write (on x86) or due to device restoration
86+
// reading from guest memory to check the virtio queues are sane (on
87+
// ARM). This will be reported via a UFFD minor fault which needs to
88+
// be handled via memcpy. Importantly, we get to the UFFD handler
89+
// with the actual guest_memfd page already faulted in, meaning pwrite will stop
90+
// once it gets to the offset of that page (e.g. written < region.size above).
91+
// Thus, to fault in everything, we now need to skip this one page, write the
92+
// remaining region, and then deal with the "gap" via uffd_handler.serve_pf().
93+
94+
if written < region.size - uffd_handler.page_size {
95+
let r = uffd_handler.populate_via_write(
96+
region.offset as usize + written + uffd_handler.page_size,
97+
region.size - written - uffd_handler.page_size,
98+
);
99+
assert_eq!(written + r, region.size - uffd_handler.page_size);
100+
}
101+
}
102+
}
103+
}
104+
uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size);
105+
let end = get_time_us(ClockType::Monotonic);
106+
107+
println!("Finished Faulting All: {}us", end - start);
108+
}

src/firecracker/examples/uffd/on_demand_handler.rs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@
88
mod uffd_utils;
99

1010
use std::fs::File;
11+
use std::os::fd::AsRawFd;
1112
use std::os::unix::net::UnixListener;
1213

1314
use uffd_utils::{Runtime, UffdHandler};
1415

16+
use crate::uffd_utils::uffd_continue;
17+
1518
fn main() {
1619
let mut args = std::env::args();
1720
let uffd_sock_path = args.nth(1).expect("No socket path given");
@@ -90,8 +93,36 @@ fn main() {
9093
// event (if the balloon device is enabled).
9194
match event {
9295
userfaultfd::Event::Pagefault { addr, .. } => {
93-
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
94-
deferred_events.push(event);
96+
let bit = uffd_handler.addr_to_offset(addr.cast()) as usize
97+
/ uffd_handler.page_size;
98+
99+
if uffd_handler.userfault_bitmap.is_some() {
100+
if uffd_handler
101+
.userfault_bitmap
102+
.as_mut()
103+
.unwrap()
104+
.is_bit_set(bit)
105+
{
106+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
107+
deferred_events.push(event);
108+
}
109+
} else {
110+
// TODO: we currently ignore the result as we may attempt to
111+
// populate the page that is already present as we may receive
112+
// multiple minor fault events per page.
113+
let _ = uffd_continue(
114+
uffd_handler.uffd.as_raw_fd(),
115+
addr as _,
116+
uffd_handler.page_size as u64,
117+
)
118+
.inspect_err(|err| {
119+
println!("uffdio_continue error: {:?}", err)
120+
});
121+
}
122+
} else {
123+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
124+
deferred_events.push(event);
125+
}
95126
}
96127
}
97128
userfaultfd::Event::Remove { start, end } => {
@@ -111,6 +142,17 @@ fn main() {
111142
}
112143
}
113144
},
114-
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
145+
|uffd_handler: &mut UffdHandler, offset: usize| {
146+
let bytes_written = uffd_handler.populate_via_write(offset, uffd_handler.page_size);
147+
148+
if bytes_written == 0 {
149+
println!(
150+
"got a vcpu fault for an already populated page at offset {}",
151+
offset
152+
);
153+
} else {
154+
assert_eq!(bytes_written, uffd_handler.page_size);
155+
}
156+
},
115157
);
116158
}

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,47 @@ use std::time::Duration;
2121

2222
use serde::{Deserialize, Serialize};
2323
use userfaultfd::{Error, Event, Uffd};
24+
use vmm_sys_util::ioctl::ioctl_with_mut_ref;
2425
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
26+
use vmm_sys_util::{ioctl_ioc_nr, ioctl_iowr_nr};
2527

2628
use crate::uffd_utils::userfault_bitmap::UserfaultBitmap;
2729

30+
// TODO: remove when UFFDIO_CONTINUE for guest_memfd is available in the crate
31+
#[repr(C)]
32+
struct uffdio_continue {
33+
range: uffdio_range,
34+
mode: u64,
35+
mapped: u64,
36+
}
37+
38+
ioctl_iowr_nr!(UFFDIO_CONTINUE, 0xAA, 0x7, uffdio_continue);
39+
40+
#[repr(C)]
41+
struct uffdio_range {
42+
start: u64,
43+
len: u64,
44+
}
45+
46+
pub fn uffd_continue(uffd: RawFd, fault_addr: u64, len: u64) -> std::io::Result<()> {
47+
let mut cont = uffdio_continue {
48+
range: uffdio_range {
49+
start: fault_addr,
50+
len,
51+
},
52+
mode: 0, // Normal continuation mode
53+
mapped: 0,
54+
};
55+
56+
let ret = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_CONTINUE(), &mut cont) };
57+
58+
if ret == -1 {
59+
return Err(std::io::Error::last_os_error());
60+
}
61+
62+
Ok(())
63+
}
64+
2865
// This is the same with the one used in src/vmm.
2966
/// This describes the mapping between Firecracker base virtual address and offset in the
3067
/// buffer or file backend for a guest memory region. It is used to tell an external
@@ -119,7 +156,7 @@ pub struct UffdHandler {
119156
pub mem_regions: Vec<GuestRegionUffdMapping>,
120157
pub page_size: usize,
121158
backing_buffer: *const u8,
122-
uffd: Uffd,
159+
pub uffd: Uffd,
123160
removed_pages: HashSet<u64>,
124161
pub guest_memfd: Option<File>,
125162
pub guest_memfd_addr: Option<*mut u8>,
@@ -263,6 +300,20 @@ impl UffdHandler {
263300
}
264301
}
265302

303+
pub fn addr_to_offset(&self, addr: *mut u8) -> u64 {
304+
let addr = addr as u64;
305+
for region in &self.mem_regions {
306+
if region.contains(addr) {
307+
return addr - region.base_host_virt_addr + region.offset as u64;
308+
}
309+
}
310+
311+
panic!(
312+
"Could not find addr: {:#x} within guest region mappings.",
313+
addr
314+
);
315+
}
316+
266317
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
267318
// Find the start of the page that the current faulting address belongs to.
268319
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
@@ -275,7 +326,7 @@ impl UffdHandler {
275326
} else {
276327
for region in self.mem_regions.iter() {
277328
if region.contains(fault_page_addr) {
278-
return self.populate_from_file(region, fault_page_addr, len);
329+
return self.populate_from_file(&region.clone(), fault_page_addr, len);
279330
}
280331
}
281332
}
@@ -290,12 +341,61 @@ impl UffdHandler {
290341
self.mem_regions.iter().map(|r| r.size).sum()
291342
}
292343

293-
fn populate_from_file(&self, region: &GuestRegionUffdMapping, dst: u64, len: usize) -> bool {
294-
let offset = dst - region.base_host_virt_addr;
295-
let src = self.backing_buffer as u64 + region.offset + offset;
344+
pub fn populate_via_write(&mut self, offset: usize, len: usize) -> usize {
345+
// man 2 write:
346+
//
347+
// On Linux, write() (and similar system calls) will transfer at most
348+
// 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes
349+
// actually transferred. (This is true on both 32-bit and 64-bit
350+
// systems.)
351+
const MAX_WRITE_LEN: usize = 2_147_479_552;
352+
353+
assert!(
354+
offset.checked_add(len).unwrap() <= self.size(),
355+
"{} + {} >= {}",
356+
offset,
357+
len,
358+
self.size()
359+
);
360+
361+
let mut total_written = 0;
362+
363+
while total_written < len {
364+
let src = unsafe { self.backing_buffer.add(offset + total_written) };
365+
let len_to_write = (len - total_written).min(MAX_WRITE_LEN);
366+
let bytes_written = unsafe {
367+
libc::pwrite64(
368+
self.guest_memfd.as_ref().unwrap().as_raw_fd(),
369+
src.cast(),
370+
len_to_write,
371+
(offset + total_written) as libc::off64_t,
372+
)
373+
};
374+
375+
let bytes_written = match bytes_written {
376+
-1 if vmm_sys_util::errno::Error::last().errno() == libc::ENOSPC => 0,
377+
written @ 0.. => written as usize,
378+
_ => panic!("{:?}", std::io::Error::last_os_error()),
379+
};
380+
381+
self.userfault_bitmap
382+
.as_mut()
383+
.unwrap()
384+
.reset_addr_range(offset + total_written, bytes_written);
385+
386+
total_written += bytes_written;
387+
388+
if bytes_written != len_to_write {
389+
break;
390+
}
391+
}
392+
393+
total_written
394+
}
296395

396+
fn populate_via_uffdio_copy(&self, src: *const u8, dst: u64, len: usize) -> bool {
297397
unsafe {
298-
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
398+
match self.uffd.copy(src.cast(), dst as *mut _, len, true) {
299399
// Make sure the UFFD copied some bytes.
300400
Ok(value) => assert!(value > 0),
301401
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
@@ -320,6 +420,42 @@ impl UffdHandler {
320420
true
321421
}
322422

423+
fn populate_via_memcpy(&mut self, src: *const u8, dst: u64, offset: usize, len: usize) -> bool {
424+
let dst_memcpy = unsafe {
425+
self.guest_memfd_addr
426+
.expect("no guest_memfd addr")
427+
.add(offset)
428+
};
429+
430+
unsafe {
431+
std::ptr::copy_nonoverlapping(src, dst_memcpy, len);
432+
}
433+
434+
self.userfault_bitmap
435+
.as_mut()
436+
.unwrap()
437+
.reset_addr_range(offset, len);
438+
439+
uffd_continue(self.uffd.as_raw_fd(), dst, len as u64).expect("uffd_continue");
440+
441+
true
442+
}
443+
444+
fn populate_from_file(
445+
&mut self,
446+
region: &GuestRegionUffdMapping,
447+
dst: u64,
448+
len: usize,
449+
) -> bool {
450+
let offset = (region.offset + dst - region.base_host_virt_addr) as usize;
451+
let src = unsafe { self.backing_buffer.add(offset) };
452+
453+
match self.guest_memfd {
454+
Some(_) => self.populate_via_memcpy(src, dst, offset, len),
455+
None => self.populate_via_uffdio_copy(src, dst, len),
456+
}
457+
}
458+
323459
fn zero_out(&mut self, addr: u64) {
324460
let ret = unsafe {
325461
self.uffd
@@ -666,7 +802,7 @@ mod tests {
666802
let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
667803
// Update runtime with actual runtime
668804
let runtime = uninit_runtime.write(Runtime::new(stream, file));
669-
runtime.run(|_: &mut UffdHandler| {});
805+
runtime.run(|_: &mut UffdHandler| {}, |_: &mut UffdHandler, _: usize| {});
670806
});
671807

672808
// wait for runtime thread to initialize itself

0 commit comments

Comments
 (0)