Skip to content

Commit 01e381a

Browse files
committed
test(uffd_utils): add handling for FaultRequest in secret freedom
There are two ways a UFFD handler receives a fault notification if Secret Freedom is enabled (which is inferred from 3 fds sent by Firecracker instead of 1): - a VMM- or KVM-triggered fault is delivered via a minor UFFD fault event. The handler is supposed to respond to it by memcpying the content of the page (if the page hasn't already been populated) followed by a UFFDIO_CONTINUE call. - a vCPU-triggered fault is delivered via a FaultRequest message on the UDS socket. The handler is supposed to reply with a pwrite64 call on the guest_memfd to populate the page followed by a FaultReply message on the UDS socket. In both cases, the handler also needs to clear the bit in the userfault bitmap at the corresponding offset in order to stop further fault notifications for the same page. UFFD handlers use the userfault bitmap for two purposes: - communicate to the kernel whether a fault at the corresponding guest_memfd offset will cause a VM exit - keep track of pages that have already been populated in order to avoid overwriting the content of the page that is already initialised. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 7d6cbbf commit 01e381a

File tree

3 files changed

+250
-19
lines changed

3 files changed

+250
-19
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@
88
mod uffd_utils;
99

1010
use std::fs::File;
11+
use std::os::fd::AsRawFd;
1112
use std::os::unix::net::UnixListener;
1213

1314
use uffd_utils::{Runtime, UffdHandler};
1415
use utils::time::{ClockType, get_time_us};
1516

17+
use crate::uffd_utils::uffd_continue;
18+
1619
fn main() {
1720
let mut args = std::env::args();
1821
let uffd_sock_path = args.nth(1).expect("No socket path given");
@@ -37,19 +40,69 @@ fn main() {
3740
.expect("Failed to read uffd_msg")
3841
.expect("uffd_msg not ready");
3942

40-
match event {
41-
userfaultfd::Event::Pagefault { .. } => {
42-
let start = get_time_us(ClockType::Monotonic);
43-
for region in uffd_handler.mem_regions.clone() {
44-
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
45-
}
46-
let end = get_time_us(ClockType::Monotonic);
43+
if let userfaultfd::Event::Pagefault { addr, .. } = event {
44+
let bit =
45+
uffd_handler.addr_to_offset(addr.cast()) as usize / uffd_handler.page_size;
46+
47+
// If Secret Free, we know if this is the first fault based on the userfault
48+
// bitmap state. Otherwise, we assume that we will only ever receive a single fault
49+
// event via UFFD.
50+
let are_we_faulted_yet = uffd_handler
51+
.userfault_bitmap
52+
.as_mut()
53+
.map_or(false, |bitmap| !bitmap.is_bit_set(bit));
4754

48-
println!("Finished Faulting All: {}us", end - start);
55+
if are_we_faulted_yet {
56+
// TODO: we currently ignore the result as we may attempt to
57+
// populate the page that is already present as we may receive
58+
// multiple minor fault events per page.
59+
let _ = uffd_continue(
60+
uffd_handler.uffd.as_raw_fd(),
61+
addr as _,
62+
uffd_handler.page_size as u64,
63+
)
64+
.inspect_err(|err| println!("Error during uffdio_continue: {:?}", err));
65+
} else {
66+
fault_all(uffd_handler, addr);
4967
}
50-
_ => panic!("Unexpected event on userfaultfd"),
5168
}
5269
},
5370
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
5471
);
5572
}
73+
74+
fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) {
75+
let start = get_time_us(ClockType::Monotonic);
76+
for region in uffd_handler.mem_regions.clone() {
77+
match uffd_handler.guest_memfd {
78+
None => {
79+
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
80+
}
81+
Some(_) => {
82+
let written = uffd_handler.populate_via_write(region.offset as usize, region.size);
83+
84+
// This code is written under the assumption that the first fault triggered by
85+
// Firecracker is either due to an MSR write (on x86) or due to device restoration
86+
// reading from guest memory to check the virtio queues are sane (on
87+
// ARM). This will be reported via a UFFD minor fault which needs to
88+
// be handled via memcpy. Importantly, we get to the UFFD handler
89+
// with the actual guest_memfd page already faulted in, meaning pwrite will stop
90+
// once it gets to the offset of that page (i.e. written < region.size above).
91+
// Thus, to fault in everything, we now need to skip this one page, write the
92+
// remaining region, and then deal with the "gap" via uffd_handler.serve_pf().
93+
94+
if written < region.size - uffd_handler.page_size {
95+
let r = uffd_handler.populate_via_write(
96+
region.offset as usize + written + uffd_handler.page_size,
97+
region.size - written - uffd_handler.page_size,
98+
);
99+
assert_eq!(written + r, region.size - uffd_handler.page_size);
100+
}
101+
}
102+
}
103+
}
104+
uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size);
105+
let end = get_time_us(ClockType::Monotonic);
106+
107+
println!("Finished Faulting All: {}us", end - start);
108+
}

src/firecracker/examples/uffd/on_demand_handler.rs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@
88
mod uffd_utils;
99

1010
use std::fs::File;
11+
use std::os::fd::AsRawFd;
1112
use std::os::unix::net::UnixListener;
1213

1314
use uffd_utils::{Runtime, UffdHandler};
1415

16+
use crate::uffd_utils::uffd_continue;
17+
1518
fn main() {
1619
let mut args = std::env::args();
1720
let uffd_sock_path = args.nth(1).expect("No socket path given");
@@ -90,8 +93,36 @@ fn main() {
9093
// event (if the balloon device is enabled).
9194
match event {
9295
userfaultfd::Event::Pagefault { addr, .. } => {
93-
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
94-
deferred_events.push(event);
96+
let bit = uffd_handler.addr_to_offset(addr.cast()) as usize
97+
/ uffd_handler.page_size;
98+
99+
if uffd_handler.userfault_bitmap.is_some() {
100+
if uffd_handler
101+
.userfault_bitmap
102+
.as_mut()
103+
.unwrap()
104+
.is_bit_set(bit)
105+
{
106+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
107+
deferred_events.push(event);
108+
}
109+
} else {
110+
// TODO: we currently ignore the result as we may attempt to
111+
// populate the page that is already present as we may receive
112+
// multiple minor fault events per page.
113+
let _ = uffd_continue(
114+
uffd_handler.uffd.as_raw_fd(),
115+
addr as _,
116+
uffd_handler.page_size as u64,
117+
)
118+
.inspect_err(|err| {
119+
println!("uffdio_continue error: {:?}", err)
120+
});
121+
}
122+
} else {
123+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
124+
deferred_events.push(event);
125+
}
95126
}
96127
}
97128
userfaultfd::Event::Remove { start, end } => {
@@ -111,6 +142,17 @@ fn main() {
111142
}
112143
}
113144
},
114-
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
145+
|uffd_handler: &mut UffdHandler, offset: usize| {
146+
let bytes_written = uffd_handler.populate_via_write(offset, uffd_handler.page_size);
147+
148+
if bytes_written == 0 {
149+
println!(
150+
"got a vcpu fault for an already populated page at offset {}",
151+
offset
152+
);
153+
} else {
154+
assert_eq!(bytes_written, uffd_handler.page_size);
155+
}
156+
},
115157
);
116158
}

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,47 @@ use std::time::Duration;
2121
use serde::{Deserialize, Serialize};
2222
use serde_json::{Deserializer, StreamDeserializer};
2323
use userfaultfd::{Error, Event, Uffd};
24+
use vmm_sys_util::ioctl::ioctl_with_mut_ref;
2425
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
26+
use vmm_sys_util::{ioctl_ioc_nr, ioctl_iowr_nr};
2527

2628
use crate::uffd_utils::userfault_bitmap::UserfaultBitmap;
2729

30+
// TODO: remove when UFFDIO_CONTINUE for guest_memfd is available in the crate
31+
#[repr(C)]
32+
struct uffdio_continue {
33+
range: uffdio_range,
34+
mode: u64,
35+
mapped: u64,
36+
}
37+
38+
ioctl_iowr_nr!(UFFDIO_CONTINUE, 0xAA, 0x7, uffdio_continue);
39+
40+
#[repr(C)]
41+
struct uffdio_range {
42+
start: u64,
43+
len: u64,
44+
}
45+
46+
pub fn uffd_continue(uffd: RawFd, fault_addr: u64, len: u64) -> std::io::Result<()> {
47+
let mut cont = uffdio_continue {
48+
range: uffdio_range {
49+
start: fault_addr,
50+
len,
51+
},
52+
mode: 0, // Normal continuation mode
53+
mapped: 0,
54+
};
55+
56+
let ret = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_CONTINUE(), &mut cont) };
57+
58+
if ret == -1 {
59+
return Err(std::io::Error::last_os_error());
60+
}
61+
62+
Ok(())
63+
}
64+
2865
// This is the same as the one used in src/vmm.
2966
/// This describes the mapping between Firecracker base virtual address and offset in the
3067
/// buffer or file backend for a guest memory region. It is used to tell an external
@@ -117,7 +154,7 @@ pub struct UffdHandler {
117154
pub mem_regions: Vec<GuestRegionUffdMapping>,
118155
pub page_size: usize,
119156
backing_buffer: *const u8,
120-
uffd: Uffd,
157+
pub uffd: Uffd,
121158
removed_pages: HashSet<u64>,
122159
pub guest_memfd: Option<File>,
123160
pub guest_memfd_addr: Option<*mut u8>,
@@ -261,6 +298,20 @@ impl UffdHandler {
261298
}
262299
}
263300

301+
pub fn addr_to_offset(&self, addr: *mut u8) -> u64 {
302+
let addr = addr as u64;
303+
for region in &self.mem_regions {
304+
if region.contains(addr) {
305+
return addr - region.base_host_virt_addr + region.offset as u64;
306+
}
307+
}
308+
309+
panic!(
310+
"Could not find addr: {:#x} within guest region mappings.",
311+
addr
312+
);
313+
}
314+
264315
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
265316
// Find the start of the page that the current faulting address belongs to.
266317
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
@@ -273,7 +324,7 @@ impl UffdHandler {
273324
} else {
274325
for region in self.mem_regions.iter() {
275326
if region.contains(fault_page_addr) {
276-
return self.populate_from_file(region, fault_page_addr, len);
327+
return self.populate_from_file(&region.clone(), fault_page_addr, len);
277328
}
278329
}
279330
}
@@ -288,12 +339,61 @@ impl UffdHandler {
288339
self.mem_regions.iter().map(|r| r.size).sum()
289340
}
290341

291-
fn populate_from_file(&self, region: &GuestRegionUffdMapping, dst: u64, len: usize) -> bool {
292-
let offset = dst - region.base_host_virt_addr;
293-
let src = self.backing_buffer as u64 + region.offset + offset;
342+
pub fn populate_via_write(&mut self, offset: usize, len: usize) -> usize {
343+
// man 2 write:
344+
//
345+
// On Linux, write() (and similar system calls) will transfer at most
346+
// 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes
347+
// actually transferred. (This is true on both 32-bit and 64-bit
348+
// systems.)
349+
const MAX_WRITE_LEN: usize = 2_147_479_552;
350+
351+
assert!(
352+
offset.checked_add(len).unwrap() <= self.size(),
353+
"{} + {} >= {}",
354+
offset,
355+
len,
356+
self.size()
357+
);
358+
359+
let mut total_written = 0;
360+
361+
while total_written < len {
362+
let src = unsafe { self.backing_buffer.add(offset + total_written) };
363+
let len_to_write = (len - total_written).min(MAX_WRITE_LEN);
364+
let bytes_written = unsafe {
365+
libc::pwrite64(
366+
self.guest_memfd.as_ref().unwrap().as_raw_fd(),
367+
src.cast(),
368+
len_to_write,
369+
(offset + total_written) as libc::off64_t,
370+
)
371+
};
372+
373+
let bytes_written = match bytes_written {
374+
-1 if vmm_sys_util::errno::Error::last().errno() == libc::ENOSPC => 0,
375+
written @ 0.. => written as usize,
376+
_ => panic!("{:?}", std::io::Error::last_os_error()),
377+
};
378+
379+
self.userfault_bitmap
380+
.as_mut()
381+
.unwrap()
382+
.reset_addr_range(offset + total_written, bytes_written);
383+
384+
total_written += bytes_written;
385+
386+
if bytes_written != len_to_write {
387+
break;
388+
}
389+
}
390+
391+
total_written
392+
}
294393

394+
fn populate_via_uffdio_copy(&self, src: *const u8, dst: u64, len: usize) -> bool {
295395
unsafe {
296-
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
396+
match self.uffd.copy(src.cast(), dst as *mut _, len, true) {
297397
// Make sure the UFFD copied some bytes.
298398
Ok(value) => assert!(value > 0),
299399
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
@@ -318,6 +418,42 @@ impl UffdHandler {
318418
true
319419
}
320420

421+
fn populate_via_memcpy(&mut self, src: *const u8, dst: u64, offset: usize, len: usize) -> bool {
422+
let dst_memcpy = unsafe {
423+
self.guest_memfd_addr
424+
.expect("no guest_memfd addr")
425+
.add(offset)
426+
};
427+
428+
unsafe {
429+
std::ptr::copy_nonoverlapping(src, dst_memcpy, len);
430+
}
431+
432+
self.userfault_bitmap
433+
.as_mut()
434+
.unwrap()
435+
.reset_addr_range(offset, len);
436+
437+
uffd_continue(self.uffd.as_raw_fd(), dst, len as u64).expect("uffd_continue");
438+
439+
true
440+
}
441+
442+
fn populate_from_file(
443+
&mut self,
444+
region: &GuestRegionUffdMapping,
445+
dst: u64,
446+
len: usize,
447+
) -> bool {
448+
let offset = (region.offset + dst - region.base_host_virt_addr) as usize;
449+
let src = unsafe { self.backing_buffer.add(offset) };
450+
451+
match self.guest_memfd {
452+
Some(_) => self.populate_via_memcpy(src, dst, offset, len),
453+
None => self.populate_via_uffdio_copy(src, dst, len),
454+
}
455+
}
456+
321457
fn zero_out(&mut self, addr: u64) {
322458
let ret = unsafe {
323459
self.uffd
@@ -619,7 +755,7 @@ mod tests {
619755
let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
620756
// Update runtime with actual runtime
621757
let runtime = uninit_runtime.write(Runtime::new(stream, file));
622-
runtime.run(|_: &mut UffdHandler| {});
758+
runtime.run(|_: &mut UffdHandler| {}, |_: &mut UffdHandler, _: usize| {});
623759
});
624760

625761
// wait for runtime thread to initialize itself

0 commit comments

Comments
 (0)