@@ -26,24 +26,79 @@ fn main() {
2626    let  mut  runtime = Runtime :: new ( stream,  file) ; 
2727    runtime. install_panic_hook ( ) ; 
2828    runtime. run ( |uffd_handler :  & mut  UffdHandler | { 
29-         // Read an event from the userfaultfd. 
30-         let  event = uffd_handler
31-             . read_event ( ) 
32-             . expect ( "Failed to read uffd_msg" ) 
33-             . expect ( "uffd_msg not ready" ) ; 
34- 
35-         // We expect to receive either a Page Fault or Removed 
36-         // event (if the balloon device is enabled). 
37-         match  event { 
38-             userfaultfd:: Event :: Pagefault  {  addr,  .. }  => { 
39-                 uffd_handler. serve_pf ( addr. cast ( ) ,  uffd_handler. page_size ) 
29+         // !DISCLAIMER! 
30+         // When using UFFD together with the balloon device, this handler needs to deal with 
31+         // `remove` and `pagefault` events. There are multiple things to keep in mind in 
32+         // such setups: 
33+         // 
34+         // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN 
35+         // ----------------------------------------------------------------------------------- 
36+         // 
37+         // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event 
38+         // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the 
39+         // UFFD, and then go back to process the pre-fetched events. 
40+         // 
41+         // UFFD might receive events not in their causal order 
42+         // ----------------------------------------------------- 
43+         // 
44+         // For example, the guest 
45+         // kernel might first respond to a balloon inflation by freeing some memory, and 
46+         // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the 
47+         // free memory range, which causes a `remove` event to be sent to UFFD. Then, the 
48+         // guest kernel might immediately fault the page in again (for example because 
49+         // deflate_on_oom was set), which causes a `pagefault` event to be sent to UFFD. 
50+         // 
51+         // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the 
52+         // balloon device is handled by Firecracker on its VMM thread. This means that potentially 
53+         // this handler can receive the `pagefault` _before_ the `remove` event. 
54+         // 
55+         // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events 
56+         // to make sure no `remove` event is blocking us can result in the handler acting on 
57+         // the `pagefault` event before the `remove` message (despite the `remove` event being 
58+         // in the causal past of the `pagefault` event), which means that we will fault in a page 
59+         // from the snapshot file, while really we should be faulting in a zero page. 
60+         // 
61+         // In this example handler, we ignore this problem, to avoid 
62+         // complexity (under the assumption that the guest kernel will zero a newly faulted in 
63+         // page anyway). A production handler will most likely want to ensure that `remove` 
64+         // events for a specific range are always handled before `pagefault` events. 
65+         // 
66+         // Lastly, we still need to deal with the race condition where a `remove` event arrives 
67+         // in the UFFD queue after we got done reading all events, in which case we need to go 
68+         // back to reading more events before we can continue processing `pagefault`s. 
69+         let  mut  deferred_events = Vec :: new ( ) ; 
70+ 
71+         loop  { 
72+             // First, try events that we couldn't handle last round 
73+             let  mut  events_to_handle = Vec :: from_iter ( deferred_events. drain ( ..) ) ; 
74+ 
75+             // Read all events from the userfaultfd. 
76+             while  let  Some ( event)  = uffd_handler. read_event ( ) . expect ( "Failed to read uffd_msg" )  { 
77+                 events_to_handle. push ( event) ; 
78+             } 
79+ 
80+             for  event in  events_to_handle. drain ( ..)  { 
81+                 // We expect to receive either a Page Fault or `remove` 
82+                 // event (if the balloon device is enabled). 
83+                 match  event { 
84+                     userfaultfd:: Event :: Pagefault  {  addr,  .. }  => { 
85+                         if  !uffd_handler. serve_pf ( addr. cast ( ) ,  uffd_handler. page_size )  { 
86+                             deferred_events. push ( event) ; 
87+                         } 
88+                     } 
89+                     userfaultfd:: Event :: Remove  {  start,  end }  => uffd_handler
90+                         . update_mem_state_mappings ( start as  u64 ,  end as  u64 ,  MemPageState :: Removed ) , 
91+                     _ => panic ! ( "Unexpected event on userfaultfd" ) , 
92+                 } 
93+             } 
94+ 
95+             // We assume that really only the above removed/pagefault interaction can result in 
96+             // deferred events. In that scenario, the loop will always terminate (unless 
97+             // newly arriving `remove` events end up indefinitely blocking it, but there's nothing 
98+             // we can do about that, and it's a largely theoretical problem). 
99+             if  deferred_events. is_empty ( )  { 
100+                 break ; 
40101            } 
41-             userfaultfd:: Event :: Remove  {  start,  end }  => uffd_handler. update_mem_state_mappings ( 
42-                 start as  u64 , 
43-                 end as  u64 , 
44-                 MemPageState :: Removed , 
45-             ) , 
46-             _ => panic ! ( "Unexpected event on userfaultfd" ) , 
47102        } 
48103    } ) ; 
49104} 
0 commit comments