@@ -26,24 +26,70 @@ fn main() {
2626    let  mut  runtime = Runtime :: new ( stream,  file) ; 
2727    runtime. install_panic_hook ( ) ; 
2828    runtime. run ( |uffd_handler :  & mut  UffdHandler | { 
29-         // Read an event from the userfaultfd. 
30-         let  event = uffd_handler
31-             . read_event ( ) 
32-             . expect ( "Failed to read uffd_msg" ) 
33-             . expect ( "uffd_msg not ready" ) ; 
34- 
35-         // We expect to receive either a Page Fault or Removed 
36-         // event (if the balloon device is enabled). 
37-         match  event { 
38-             userfaultfd:: Event :: Pagefault  {  addr,  .. }  => { 
39-                 uffd_handler. serve_pf ( addr. cast ( ) ,  uffd_handler. page_size ) 
29+         // !DISCLAIMER! 
30+         // When using UFFD together with the balloon device, this handler needs to deal with Removed 
31+         // and Pagefault events. However, since these are induced by different threads over in 
32+         // Firecracker-land they might get here in the wrong order: For example, the guest 
33+         // kernel might first respond to a balloon inflation by freeing some memory, and 
34+         // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the 
35+         // free memory range, which causes a Removed event to be sent to UFFD. Then, the 
36+         // guest kernel might immediately fault the page in again, which causes a Pagefault 
37+         // event to be sent to UFFD. 
38+         // 
39+         // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the 
40+         // balloon device is handled by Firecracker on its VMM thread. This means that potentially 
41+         // this handler can receive the Pagefault _before_ the Removed event. 
42+         // 
43+         // This leads to two problems: 
44+         // 1. While a Removed event is pending (i.e. in the fd's queue, but not read yet), all UFFD 
45+         //    ioctls such as UFFDIO_COPY will return -EAGAIN 
46+         // 2. Processing a Pagefault event before a Removed event where the order has been swapped 
47+         //    as above means that we will fault in a page from the snapshot file, while really we 
48+         //    should be faulting in a zero page. 
49+         // 
50+         // Problem 1 is solved fairly easily by simply reading all available events ahead of time 
51+         // to unblock the UFFD. Problem 2 is ignored in this example handler, to avoid 
52+         // complexity (under the assumption that the guest kernel will zero a newly faulted in 
53+         // page anyway). A production handler will most likely want to ensure that Removed 
54+         // events for a specific range are always handled before Pagefault events. 
55+         // 
56+         // Lastly, we still need to deal with the race condition where a Removed event arrives 
57+         // in the UFFD queue after we have finished reading all events, in which case we need to go 
58+         // back to reading more events before we can continue processing Pagefaults. 
59+ 
60+         let  mut  deferred_events = Vec :: new ( ) ; 
61+ 
62+         loop  { 
63+             // First, try events that we couldn't handle last round 
64+             let  mut  events_to_handle = Vec :: from_iter ( deferred_events. drain ( ..) ) ; 
65+ 
66+             // Read all events from the userfaultfd. 
67+             while  let  Some ( event)  = uffd_handler. read_event ( ) . expect ( "Failed to read uffd_msg" )  { 
68+                 events_to_handle. push ( event) ; 
69+             } 
70+ 
71+             for  event in  events_to_handle. drain ( ..)  { 
72+                 // We expect to receive either a Page Fault or Removed 
73+                 // event (if the balloon device is enabled). 
74+                 match  event { 
75+                     userfaultfd:: Event :: Pagefault  {  addr,  .. }  => { 
76+                         if  !uffd_handler. serve_pf ( addr. cast ( ) ,  uffd_handler. page_size )  { 
77+                             deferred_events. push ( event) ; 
78+                         } 
79+                     } 
80+                     userfaultfd:: Event :: Remove  {  start,  end }  => uffd_handler
81+                         . update_mem_state_mappings ( start as  u64 ,  end as  u64 ,  MemPageState :: Removed ) , 
82+                     _ => panic ! ( "Unexpected event on userfaultfd" ) , 
83+                 } 
84+             } 
85+ 
86+             // We assume that really only the above removed/pagefault interaction can result in 
87+             // deferred events. In that scenario, the loop will always terminate (unless 
88+             // newly arriving Removed events end up indefinitely blocking it, but there's nothing 
89+             // we can do about that, and it's a largely theoretical problem). 
90+             if  deferred_events. is_empty ( )  { 
91+                 break ; 
4092            } 
41-             userfaultfd:: Event :: Remove  {  start,  end }  => uffd_handler. update_mem_state_mappings ( 
42-                 start as  u64 , 
43-                 end as  u64 , 
44-                 MemPageState :: Removed , 
45-             ) , 
46-             _ => panic ! ( "Unexpected event on userfaultfd" ) , 
4793        } 
4894    } ) ; 
4995} 
0 commit comments