Skip to content

Commit 1107895

Browse files
committed
fix(example): correctly handle Removed events in uffd exammple
The crux of the issue was that UFFD gets blocked (all ioctls return -EAGAIN) when there's any `remove` events pending in the queue, which means during processing we not only need to look at the "head" of the queue, but also make sure there's no `remove` events in the "tail". Deal with these scenarios correctly by always greedily reading the entire queue, to ensure there's nothing pending, and only then processing things one-by-one. Please see the new code comments for intricacies with this approach. Fixes #4990 Signed-off-by: Patrick Roy <[email protected]>
1 parent 6759168 commit 1107895

File tree

3 files changed

+105
-28
lines changed

3 files changed

+105
-28
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ fn main() {
3636
userfaultfd::Event::Pagefault { .. } => {
3737
for region in uffd_handler.mem_regions.clone() {
3838
uffd_handler
39-
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
39+
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
4040
}
4141
}
4242
_ => panic!("Unexpected event on userfaultfd"),

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ impl UffdHandler {
116116
}
117117
}
118118

119-
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
119+
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
120120
// Find the start of the page that the current faulting address belongs to.
121121
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
122122
let fault_page_addr = dst as u64;
@@ -133,14 +133,18 @@ impl UffdHandler {
133133
// event was received. This can be a consequence of guest reclaiming back its
134134
// memory from the host (through balloon device)
135135
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
136-
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
137-
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
138-
return;
136+
match self.populate_from_file(region, fault_page_addr, len) {
137+
Some((start, end)) => {
138+
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
139+
}
140+
None => return false,
141+
}
142+
return true;
139143
}
140144
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
141145
let (start, end) = self.zero_out(fault_page_addr);
142146
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
143-
return;
147+
return true;
144148
}
145149
None => {}
146150
}
@@ -152,20 +156,39 @@ impl UffdHandler {
152156
);
153157
}
154158

155-
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
159+
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
156160
let offset = dst - region.mapping.base_host_virt_addr;
157161
let src = self.backing_buffer as u64 + region.mapping.offset + offset;
158162

159163
let ret = unsafe {
160-
self.uffd
161-
.copy(src as *const _, dst as *mut _, len, true)
162-
.expect("Uffd copy failed")
164+
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
165+
Ok(value) => value,
166+
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
167+
// queue while we're processing `pagefault` events.
168+
// The weird cast is because the `bytes_copied` field is based on the
169+
// `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something
170+
// goes wrong, it gets set to a -errno code. However, uffd-rs always casts this
171+
// value to an unsigned `usize`, which scrambled the errno.
172+
Err(Error::PartiallyCopied(bytes_copied))
173+
if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize =>
174+
{
175+
return None
176+
}
177+
Err(Error::CopyFailed(errno))
178+
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
179+
{
180+
len
181+
}
182+
Err(e) => {
183+
panic!("Uffd copy failed: {e:?}");
184+
}
185+
}
163186
};
164187

165188
// Make sure the UFFD copied some bytes.
166189
assert!(ret > 0);
167190

168-
(dst, dst + len as u64)
191+
Some((dst, dst + len as u64))
169192
}
170193

171194
fn zero_out(&mut self, addr: u64) -> (u64, u64) {

src/firecracker/examples/uffd/valid_handler.rs

Lines changed: 71 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,78 @@ fn main() {
2626
let mut runtime = Runtime::new(stream, file);
2727
runtime.install_panic_hook();
2828
runtime.run(|uffd_handler: &mut UffdHandler| {
29-
// Read an event from the userfaultfd.
30-
let event = uffd_handler
31-
.read_event()
32-
.expect("Failed to read uffd_msg")
33-
.expect("uffd_msg not ready");
34-
35-
// We expect to receive either a Page Fault or Removed
36-
// event (if the balloon device is enabled).
37-
match event {
38-
userfaultfd::Event::Pagefault { addr, .. } => {
39-
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
29+
// !DISCLAIMER!
30+
// When using UFFD together with the balloon device, this handler needs to deal with Removed
31+
// and Pagefault events. There are multiple things to keep in mind in such setups:
32+
//
33+
// As long as any Removed event in pending in the UFFD queue, all ioctls return EAGAIN
34+
// -----------------------------------------------------------------------------------
35+
//
36+
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
37+
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
38+
// UFFD, and then go back to the process the pre-fetched events.
39+
//
40+
// UFFD might receive events in not in their causal order
41+
// -----------------------------------------------------
42+
//
43+
// For example, the guest
44+
// kernel might first respond to a balloon inflation by freeing some memory, and
45+
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
46+
// free memory range, which causes a Removed event to be sent to UFFD. Then, the
47+
// guest kernel might immediately fault the page in again (for example because
48+
// default_on_oom was set). which causes a Pagefault event to be sent to UFFD.
49+
//
50+
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
51+
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
52+
// this handler can receive the Pagefault _before_ the Removed event.
53+
//
54+
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
55+
// to make sure no `remove` event is blocking us can result in the handler acting on
56+
// the `pagefault` event before the `remove` message (despite the `remove` event being
57+
// in the causal past of the `pagefault` event), which means that we will fault in a page
58+
// from the snapshot file, while really we should be faulting in a zero page.
59+
//
60+
// In this example handler, we ignore this problem, to avoid
61+
// complexity (under the assumption that the guest kernel will zero a newly faulted in
62+
// page anyway). A production handler will most likely want to ensure that Removed
63+
// events for a specific range are always handled before Pagefault events.
64+
//
65+
// Lastly, we still need to deal with the race condition where a Removed event arrives
66+
// in the UFFD queue after we got done reading all events, in which case we need to go
67+
// back to reading more events before we can continue processing Pagefaults.
68+
let mut deferred_events = Vec::new();
69+
70+
loop {
71+
// First, try events that we couldn't handle last round
72+
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));
73+
74+
// Read all events from the userfaultfd.
75+
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
76+
events_to_handle.push(event);
77+
}
78+
79+
for event in events_to_handle.drain(..) {
80+
// We expect to receive either a Page Fault or Removed
81+
// event (if the balloon device is enabled).
82+
match event {
83+
userfaultfd::Event::Pagefault { addr, .. } => {
84+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
85+
deferred_events.push(event);
86+
}
87+
}
88+
userfaultfd::Event::Remove { start, end } => uffd_handler
89+
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
90+
_ => panic!("Unexpected event on userfaultfd"),
91+
}
92+
}
93+
94+
// We assume that really only the above removed/pagefault interaction can result in
95+
// deferred events. In that scenario, the loop will always terminate (unless
96+
// newly arriving Removed events end up indefinitely blocking it, but there's nothing
97+
// we can do about that, and it's a largely theoretical problem).
98+
if deferred_events.is_empty() {
99+
break;
40100
}
41-
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
42-
start as u64,
43-
end as u64,
44-
MemPageState::Removed,
45-
),
46-
_ => panic!("Unexpected event on userfaultfd"),
47101
}
48102
});
49103
}

0 commit comments

Comments
 (0)