@@ -26,24 +26,78 @@ fn main() {
2626 let mut runtime = Runtime :: new ( stream, file) ;
2727 runtime. install_panic_hook ( ) ;
2828 runtime. run ( |uffd_handler : & mut UffdHandler | {
29- // Read an event from the userfaultfd.
30- let event = uffd_handler
31- . read_event ( )
32- . expect ( "Failed to read uffd_msg" )
33- . expect ( "uffd_msg not ready" ) ;
34-
35- // We expect to receive either a Page Fault or Removed
36- // event (if the balloon device is enabled).
37- match event {
38- userfaultfd:: Event :: Pagefault { addr, .. } => {
39- uffd_handler. serve_pf ( addr. cast ( ) , uffd_handler. page_size )
29+ // !DISCLAIMER!
30+ // When using UFFD together with the balloon device, this handler needs to deal with Removed
31+ // and Pagefault events. There are multiple things to keep in mind in such setups:
32+ //
33+ // As long as any Removed event is pending in the UFFD queue, all ioctls return EAGAIN
34+ // -----------------------------------------------------------------------------------
35+ //
36+ // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
37+ // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
38+ // UFFD, and then go back to process the pre-fetched events.
39+ //
40+ // UFFD might receive events not in their causal order
41+ // ---------------------------------------------------
42+ //
43+ // For example, the guest
44+ // kernel might first respond to a balloon inflation by freeing some memory, and
45+ // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
46+ // free memory range, which causes a Removed event to be sent to UFFD. Then, the
47+ // guest kernel might immediately fault the page in again (for example because
48+ // deflate_on_oom was set), which causes a Pagefault event to be sent to UFFD.
49+ //
50+ // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
51+ // balloon device is handled by Firecracker on its VMM thread. This means that potentially
52+ // this handler can receive the Pagefault _before_ the Removed event.
53+ //
54+ // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
55+ // to make sure no `remove` event is blocking us can result in the handler acting on
56+ // the `pagefault` event before the `remove` message (despite the `remove` event being
57+ // in the causal past of the `pagefault` event), which means that we will fault in a page
58+ // from the snapshot file, while really we should be faulting in a zero page.
59+ //
60+ // In this example handler, we ignore this problem, to avoid
61+ // complexity (under the assumption that the guest kernel will zero a newly faulted in
62+ // page anyway). A production handler will most likely want to ensure that Removed
63+ // events for a specific range are always handled before Pagefault events.
64+ //
65+ // Lastly, we still need to deal with the race condition where a Removed event arrives
66+ // in the UFFD queue after we got done reading all events, in which case we need to go
67+ // back to reading more events before we can continue processing Pagefaults.
68+ let mut deferred_events = Vec :: new ( ) ;
69+
70+ loop {
71+ // First, try events that we couldn't handle last round
72+ let mut events_to_handle = Vec :: from_iter ( deferred_events. drain ( ..) ) ;
73+
74+ // Read all events from the userfaultfd.
75+ while let Some ( event) = uffd_handler. read_event ( ) . expect ( "Failed to read uffd_msg" ) {
76+ events_to_handle. push ( event) ;
77+ }
78+
79+ for event in events_to_handle. drain ( ..) {
80+ // We expect to receive either a Page Fault or Removed
81+ // event (if the balloon device is enabled).
82+ match event {
83+ userfaultfd:: Event :: Pagefault { addr, .. } => {
84+ if !uffd_handler. serve_pf ( addr. cast ( ) , uffd_handler. page_size ) {
85+ deferred_events. push ( event) ;
86+ }
87+ }
88+ userfaultfd:: Event :: Remove { start, end } => uffd_handler
89+ . update_mem_state_mappings ( start as u64 , end as u64 , MemPageState :: Removed ) ,
90+ _ => panic ! ( "Unexpected event on userfaultfd" ) ,
91+ }
92+ }
93+
94+ // We assume that really only the above removed/pagefault interaction can result in
95+ // deferred events. In that scenario, the loop will always terminate (unless
96+ // newly arriving Removed events end up indefinitely blocking it, but there's nothing
97+ // we can do about that, and it's a largely theoretical problem).
98+ if deferred_events. is_empty ( ) {
99+ break ;
40100 }
41- userfaultfd:: Event :: Remove { start, end } => uffd_handler. update_mem_state_mappings (
42- start as u64 ,
43- end as u64 ,
44- MemPageState :: Removed ,
45- ) ,
46- _ => panic ! ( "Unexpected event on userfaultfd" ) ,
47101 }
48102 } ) ;
49103}
0 commit comments