Skip to content

Commit c9ecb4a

Browse files
authored
Merge branch 'main' into memory-cleanup
2 parents 6a14208 + f6bd4b6 commit c9ecb4a

File tree

11 files changed

+140
-41
lines changed

11 files changed

+140
-41
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ and this project adheres to
4545
- [#5007](https://github.com/firecracker-microvm/firecracker/pull/5007): Fixed
4646
watchdog softlockup warning on x86_64 guests when a vCPU is paused during GDB
4747
debugging.
48+
- [#5021](https://github.com/firecracker-microvm/firecracker/pull/5021) If a
49+
balloon device is inflated post UFFD-backed snapshot restore, Firecracker now
50+
causes `remove` UFFD messages to be sent to the UFFD handler. Previously, no
51+
such message would be sent.
4852

4953
## [1.10.1]
5054

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ fn main() {
3636
userfaultfd::Event::Pagefault { .. } => {
3737
for region in uffd_handler.mem_regions.clone() {
3838
uffd_handler
39-
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
39+
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
4040
}
4141
}
4242
_ => panic!("Unexpected event on userfaultfd"),

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ impl UffdHandler {
116116
}
117117
}
118118

119-
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
119+
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
120120
// Find the start of the page that the current faulting address belongs to.
121121
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
122122
let fault_page_addr = dst as u64;
@@ -133,14 +133,18 @@ impl UffdHandler {
133133
// event was received. This can be a consequence of guest reclaiming back its
134134
// memory from the host (through balloon device)
135135
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
136-
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
137-
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
138-
return;
136+
match self.populate_from_file(region, fault_page_addr, len) {
137+
Some((start, end)) => {
138+
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
139+
}
140+
None => return false,
141+
}
142+
return true;
139143
}
140144
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
141145
let (start, end) = self.zero_out(fault_page_addr);
142146
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
143-
return;
147+
return true;
144148
}
145149
None => {}
146150
}
@@ -152,20 +156,39 @@ impl UffdHandler {
152156
);
153157
}
154158

155-
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
159+
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
156160
let offset = dst - region.mapping.base_host_virt_addr;
157161
let src = self.backing_buffer as u64 + region.mapping.offset + offset;
158162

159163
let ret = unsafe {
160-
self.uffd
161-
.copy(src as *const _, dst as *mut _, len, true)
162-
.expect("Uffd copy failed")
164+
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
165+
Ok(value) => value,
166+
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
167+
// queue while we're processing `pagefault` events.
168+
// The weird cast is because the `bytes_copied` field is based on the
169+
// `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something
170+
// goes wrong, it gets set to a -errno code. However, uffd-rs always casts this
171+
// value to an unsigned `usize`, which scrambled the errno.
172+
Err(Error::PartiallyCopied(bytes_copied))
173+
if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize =>
174+
{
175+
return None
176+
}
177+
Err(Error::CopyFailed(errno))
178+
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
179+
{
180+
len
181+
}
182+
Err(e) => {
183+
panic!("Uffd copy failed: {e:?}");
184+
}
185+
}
163186
};
164187

165188
// Make sure the UFFD copied some bytes.
166189
assert!(ret > 0);
167190

168-
(dst, dst + len as u64)
191+
Some((dst, dst + len as u64))
169192
}
170193

171194
fn zero_out(&mut self, addr: u64) -> (u64, u64) {

src/firecracker/examples/uffd/valid_handler.rs

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,79 @@ fn main() {
2626
let mut runtime = Runtime::new(stream, file);
2727
runtime.install_panic_hook();
2828
runtime.run(|uffd_handler: &mut UffdHandler| {
29-
// Read an event from the userfaultfd.
30-
let event = uffd_handler
31-
.read_event()
32-
.expect("Failed to read uffd_msg")
33-
.expect("uffd_msg not ready");
34-
35-
// We expect to receive either a Page Fault or Removed
36-
// event (if the balloon device is enabled).
37-
match event {
38-
userfaultfd::Event::Pagefault { addr, .. } => {
39-
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
29+
// !DISCLAIMER!
30+
// When using UFFD together with the balloon device, this handler needs to deal with
31+
// `remove` and `pagefault` events. There are multiple things to keep in mind in
32+
// such setups:
33+
//
34+
// As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN
35+
// -----------------------------------------------------------------------------------
36+
//
37+
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
38+
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
39+
// UFFD, and then go back to the process the pre-fetched events.
40+
//
41+
// UFFD might receive events in not in their causal order
42+
// -----------------------------------------------------
43+
//
44+
// For example, the guest
45+
// kernel might first respond to a balloon inflation by freeing some memory, and
46+
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
47+
// free memory range, which causes a `remove` event to be sent to UFFD. Then, the
48+
// guest kernel might immediately fault the page in again (for example because
49+
// default_on_oom was set). which causes a `pagefault` event to be sent to UFFD.
50+
//
51+
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
52+
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
53+
// this handler can receive the `pagefault` _before_ the `remove` event.
54+
//
55+
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
56+
// to make sure no `remove` event is blocking us can result in the handler acting on
57+
// the `pagefault` event before the `remove` message (despite the `remove` event being
58+
// in the causal past of the `pagefault` event), which means that we will fault in a page
59+
// from the snapshot file, while really we should be faulting in a zero page.
60+
//
61+
// In this example handler, we ignore this problem, to avoid
62+
// complexity (under the assumption that the guest kernel will zero a newly faulted in
63+
// page anyway). A production handler will most likely want to ensure that `remove`
64+
// events for a specific range are always handled before `pagefault` events.
65+
//
66+
// Lastly, we still need to deal with the race condition where a `remove` event arrives
67+
// in the UFFD queue after we got done reading all events, in which case we need to go
68+
// back to reading more events before we can continue processing `pagefault`s.
69+
let mut deferred_events = Vec::new();
70+
71+
loop {
72+
// First, try events that we couldn't handle last round
73+
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));
74+
75+
// Read all events from the userfaultfd.
76+
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
77+
events_to_handle.push(event);
78+
}
79+
80+
for event in events_to_handle.drain(..) {
81+
// We expect to receive either a Page Fault or `remove`
82+
// event (if the balloon device is enabled).
83+
match event {
84+
userfaultfd::Event::Pagefault { addr, .. } => {
85+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
86+
deferred_events.push(event);
87+
}
88+
}
89+
userfaultfd::Event::Remove { start, end } => uffd_handler
90+
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
91+
_ => panic!("Unexpected event on userfaultfd"),
92+
}
93+
}
94+
95+
// We assume that really only the above removed/pagefault interaction can result in
96+
// deferred events. In that scenario, the loop will always terminate (unless
97+
// newly arriving `remove` events end up indefinitely blocking it, but there's nothing
98+
// we can do about that, and it's a largely theoretical problem).
99+
if deferred_events.is_empty() {
100+
break;
40101
}
41-
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
42-
start as u64,
43-
end as u64,
44-
MemPageState::Removed,
45-
),
46-
_ => panic!("Unexpected event on userfaultfd"),
47102
}
48103
});
49104
}

src/vmm/src/builder.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,7 @@ pub fn build_microvm_from_snapshot(
528528
resource_allocator: &mut vmm.resource_allocator,
529529
vm_resources,
530530
instance_id: &instance_info.id,
531+
restored_from_file: vmm.uffd.is_none(),
531532
};
532533

533534
vmm.mmio_device_manager =

src/vmm/src/device_manager/persist.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ pub struct MMIODevManagerConstructorArgs<'a> {
214214
pub resource_allocator: &'a mut ResourceAllocator,
215215
pub vm_resources: &'a mut VmResources,
216216
pub instance_id: &'a str,
217+
pub restored_from_file: bool,
217218
}
218219
impl fmt::Debug for MMIODevManagerConstructorArgs<'_> {
219220
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -512,7 +513,10 @@ impl<'a> Persist<'a> for MMIODeviceManager {
512513

513514
if let Some(balloon_state) = &state.balloon_device {
514515
let device = Arc::new(Mutex::new(Balloon::restore(
515-
BalloonConstructorArgs { mem: mem.clone() },
516+
BalloonConstructorArgs {
517+
mem: mem.clone(),
518+
restored_from_file: constructor_args.restored_from_file,
519+
},
516520
&balloon_state.device_state,
517521
)?));
518522

@@ -807,6 +811,7 @@ mod tests {
807811
resource_allocator: &mut resource_allocator,
808812
vm_resources,
809813
instance_id: "microvm-id",
814+
restored_from_file: true,
810815
};
811816
let restored_dev_manager =
812817
MMIODeviceManager::restore(restore_args, &device_states).unwrap();

src/vmm/src/devices/virtio/balloon/device.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ pub struct Balloon {
164164
pub(crate) irq_trigger: IrqTrigger,
165165

166166
// Implementation specific fields.
167-
pub(crate) restored: bool,
167+
pub(crate) restored_from_file: bool,
168168
pub(crate) stats_polling_interval_s: u16,
169169
pub(crate) stats_timer: TimerFd,
170170
// The index of the previous stats descriptor is saved because
@@ -189,7 +189,7 @@ impl fmt::Debug for Balloon {
189189
.field("queue_evts", &self.queue_evts)
190190
.field("device_state", &self.device_state)
191191
.field("irq_trigger", &self.irq_trigger)
192-
.field("restored", &self.restored)
192+
.field("restored_from_file", &self.restored_from_file)
193193
.field("stats_polling_interval_s", &self.stats_polling_interval_s)
194194
.field("stats_desc_index", &self.stats_desc_index)
195195
.field("latest_stats", &self.latest_stats)
@@ -204,7 +204,7 @@ impl Balloon {
204204
amount_mib: u32,
205205
deflate_on_oom: bool,
206206
stats_polling_interval_s: u16,
207-
restored: bool,
207+
restored_from_file: bool,
208208
) -> Result<Balloon, BalloonError> {
209209
let mut avail_features = 1u64 << VIRTIO_F_VERSION_1;
210210

@@ -245,7 +245,7 @@ impl Balloon {
245245
irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?,
246246
device_state: DeviceState::Inactive,
247247
activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?,
248-
restored,
248+
restored_from_file,
249249
stats_polling_interval_s,
250250
stats_timer,
251251
stats_desc_index: None,
@@ -355,7 +355,7 @@ impl Balloon {
355355
if let Err(err) = remove_range(
356356
mem,
357357
(guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT),
358-
self.restored,
358+
self.restored_from_file,
359359
) {
360360
error!("Error removing memory range: {:?}", err);
361361
}

src/vmm/src/devices/virtio/balloon/persist.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ pub struct BalloonState {
9595
pub struct BalloonConstructorArgs {
9696
/// Pointer to guest memory.
9797
pub mem: GuestMemoryMmap,
98+
pub restored_from_file: bool,
9899
}
99100

100101
impl Persist<'_> for Balloon {
@@ -121,7 +122,12 @@ impl Persist<'_> for Balloon {
121122
) -> Result<Self, Self::Error> {
122123
// We can safely create the balloon with arbitrary flags and
123124
// num_pages because we will overwrite them after.
124-
let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s, true)?;
125+
let mut balloon = Balloon::new(
126+
0,
127+
false,
128+
state.stats_polling_interval_s,
129+
constructor_args.restored_from_file,
130+
)?;
125131

126132
let mut num_queues = BALLOON_NUM_QUEUES;
127133
// As per the virtio 1.1 specification, the statistics queue
@@ -192,13 +198,16 @@ mod tests {
192198

193199
// Deserialize and restore the balloon device.
194200
let restored_balloon = Balloon::restore(
195-
BalloonConstructorArgs { mem: guest_mem },
201+
BalloonConstructorArgs {
202+
mem: guest_mem,
203+
restored_from_file: true,
204+
},
196205
&Snapshot::deserialize(&mut mem.as_slice()).unwrap(),
197206
)
198207
.unwrap();
199208

200209
assert_eq!(restored_balloon.device_type(), TYPE_BALLOON);
201-
assert!(restored_balloon.restored);
210+
assert!(restored_balloon.restored_from_file);
202211

203212
assert_eq!(restored_balloon.acked_features, balloon.acked_features);
204213
assert_eq!(restored_balloon.avail_features, balloon.avail_features);

src/vmm/src/devices/virtio/balloon/util.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> {
6868
pub(crate) fn remove_range(
6969
guest_memory: &GuestMemoryMmap,
7070
range: (GuestAddress, u64),
71-
restored: bool,
71+
restored_from_file: bool,
7272
) -> Result<(), RemoveRegionError> {
7373
let (guest_address, range_len) = range;
7474

@@ -83,7 +83,7 @@ pub(crate) fn remove_range(
8383
// Mmap a new anonymous region over the present one in order to create a hole.
8484
// This workaround is (only) needed after resuming from a snapshot because the guest memory
8585
// is mmaped from file as private and there is no `madvise` flag that works for this case.
86-
if restored {
86+
if restored_from_file {
8787
// SAFETY: The address and length are known to be valid.
8888
let ret = unsafe {
8989
libc::mmap(

src/vmm/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,6 @@ pub struct Vmm {
314314
vm: Vm,
315315
guest_memory: GuestMemoryMmap,
316316
// Save UFFD in order to keep it open in the Firecracker process, as well.
317-
// Since this field is never read again, we need to allow `dead_code`.
318-
#[allow(dead_code)]
319317
uffd: Option<Uffd>,
320318
vcpus_handles: Vec<VcpuHandle>,
321319
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.

0 commit comments

Comments
 (0)