Skip to content

Commit 441a078

Browse files
committed
fix(example): correctly handle Removed events in uffd exammple
The UFFD handler might receive events out of order compared to how they actually happened. For example, if the guest first frees a page to the balloon device, and then immediately faults it in again, the UFFD handler might see the page fault before the freeing. This is a problem, as any pending `Remove` events in the queue will "block" the userfault FD (all ioctls return -EAGAIN). Fix this by always draining all events from the fd's queue, and gracefully handling -EAGAIN. Please see the code comment for in-depth analysis of the flow. Fixes #4990 Signed-off-by: Patrick Roy <[email protected]>
1 parent 6759168 commit 441a078

File tree

3 files changed

+92
-28
lines changed

3 files changed

+92
-28
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ fn main() {
3636
userfaultfd::Event::Pagefault { .. } => {
3737
for region in uffd_handler.mem_regions.clone() {
3838
uffd_handler
39-
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
39+
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
4040
}
4141
}
4242
_ => panic!("Unexpected event on userfaultfd"),

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ impl UffdHandler {
116116
}
117117
}
118118

119-
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
119+
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
120120
// Find the start of the page that the current faulting address belongs to.
121121
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
122122
let fault_page_addr = dst as u64;
@@ -133,14 +133,18 @@ impl UffdHandler {
133133
// event was received. This can be a consequence of guest reclaiming back its
134134
// memory from the host (through balloon device)
135135
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
136-
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
137-
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
138-
return;
136+
match self.populate_from_file(region, fault_page_addr, len) {
137+
Some((start, end)) => {
138+
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
139+
}
140+
None => return false,
141+
}
142+
return true;
139143
}
140144
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
141145
let (start, end) = self.zero_out(fault_page_addr);
142146
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
143-
return;
147+
return true;
144148
}
145149
None => {}
146150
}
@@ -152,20 +156,34 @@ impl UffdHandler {
152156
);
153157
}
154158

155-
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
159+
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
156160
let offset = dst - region.mapping.base_host_virt_addr;
157161
let src = self.backing_buffer as u64 + region.mapping.offset + offset;
158162

159163
let ret = unsafe {
160-
self.uffd
161-
.copy(src as *const _, dst as *mut _, len, true)
162-
.expect("Uffd copy failed")
164+
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
165+
Ok(value) => value,
166+
// Catch EAGAIN errors, which occur when there is a copy racing with a EVENT_REMOVE.
167+
// We need to defer this page fault until after we handle the remove event, because
168+
// from the guest's PoV they happened in opposite order (e.g. first
169+
// balloon expanded and caused page deallocation, then the guest
170+
// faulted the page back in).
171+
Err(Error::PartiallyCopied(bytes_copied)) if bytes_copied == 0 => return None,
172+
Err(Error::CopyFailed(errno))
173+
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
174+
{
175+
len
176+
}
177+
Err(e) => {
178+
panic!("Uffd copy failed: {e:?}");
179+
}
180+
}
163181
};
164182

165183
// Make sure the UFFD copied some bytes.
166184
assert!(ret > 0);
167185

168-
(dst, dst + len as u64)
186+
Some((dst, dst + len as u64))
169187
}
170188

171189
fn zero_out(&mut self, addr: u64) -> (u64, u64) {

src/firecracker/examples/uffd/valid_handler.rs

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,70 @@ fn main() {
2626
let mut runtime = Runtime::new(stream, file);
2727
runtime.install_panic_hook();
2828
runtime.run(|uffd_handler: &mut UffdHandler| {
29-
// Read an event from the userfaultfd.
30-
let event = uffd_handler
31-
.read_event()
32-
.expect("Failed to read uffd_msg")
33-
.expect("uffd_msg not ready");
34-
35-
// We expect to receive either a Page Fault or Removed
36-
// event (if the balloon device is enabled).
37-
match event {
38-
userfaultfd::Event::Pagefault { addr, .. } => {
39-
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
29+
// !DISCLAIMER!
30+
// When using UFFD together with the balloon device, this handler needs to deal with Removed
31+
// and Pagefault events. However, since these are induced by different threads over in
32+
// Firecracker-land they might get here in the wrong order: For example, the guest
33+
// kernel might first respond to a balloon inflation by freeing some memory, and
34+
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
35+
// free memory range, which causes a Removed event to be sent to UFFD. Then, the
36+
// guest kernel might immediately fault the page in again, which causes a Pagefault
37+
// event to be sent to UFFD.
38+
//
39+
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
40+
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
41+
// this handler can receive the Pagefault _before_ the Removed event.
42+
//
43+
// This leads to two problems:
44+
// 1. While a Removed event is pending (e.g. in the fd's queue, but not read yet), all UFFD
45+
// ioctls such as UFFDIO_COPY will return -EAGAIN
46+
// 2. Processing a Pagefault event before a Removed event where the order has been swapped
47+
// as above means that we will fault in a page from the snapshot file, while really we
48+
// should be faulting in a zero page.
49+
//
50+
// Problem 1. is solved fairly easily by simply reading all available events ahead of time
51+
// to unblock the UFFD. Problem 2. we are ignoring in this example handler, to avoid
52+
// complexity (under the assumption that the guest kernel will zero a newly faulted in
53+
// page anyway). A production handler will most likely want to ensure that Removed
54+
// events for a specific range are always handled before Pagefault events.
55+
//
56+
// Lastly, we still need to deal with the race condition where a Removed event arrives
57+
// in the UFFD queue after we got done reading all events, in which case we need to go
58+
// back to reading more events before we can continue processing Pagefaults.
59+
60+
let mut deferred_events = Vec::new();
61+
62+
loop {
63+
// First, try events that we couldn't handle last round
64+
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));
65+
66+
// Read all events from the userfaultfd.
67+
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
68+
events_to_handle.push(event);
69+
}
70+
71+
for event in events_to_handle.drain(..) {
72+
// We expect to receive either a Page Fault or Removed
73+
// event (if the balloon device is enabled).
74+
match event {
75+
userfaultfd::Event::Pagefault { addr, .. } => {
76+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
77+
deferred_events.push(event);
78+
}
79+
}
80+
userfaultfd::Event::Remove { start, end } => uffd_handler
81+
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
82+
_ => panic!("Unexpected event on userfaultfd"),
83+
}
84+
}
85+
86+
// We assume that really only the above removed/pagefault interaction can result in
87+
// deferred events. In that scenario, the loop will always terminate (unless
88+
// newly arriving Removed events end up indefinitely blocking it, but there's nothing
89+
// we can do about that, and it's a largely theoretical problem).
90+
if deferred_events.is_empty() {
91+
break;
4092
}
41-
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
42-
start as u64,
43-
end as u64,
44-
MemPageState::Removed,
45-
),
46-
_ => panic!("Unexpected event on userfaultfd"),
4793
}
4894
});
4995
}

0 commit comments

Comments
 (0)