Commit c67eca0
feat(vmm): implement secret-free fault handling protocol
It contains two parts:
- external: between the VMM thread and the UFFD handler
- internal: between vCPUs and the VMM thread

An outline of the workflow:
- When a vCPU fault occurs, the vCPU exits to userspace
- The vCPU thread sends the exit syndrome over the vCPU-to-VMM channel and
  writes to the eventfd
- The VMM thread forwards the syndrome to the UFFD handler via the UDS socket
- The UFFD handler populates the page, clears the corresponding bit in the
  userfault bitmap and sends a reply to Firecracker
- The VMM thread receives the reply and updates a vCPU condvar to notify the
  vCPU that the fault has been resolved
- The vCPU resumes execution

Note that as a result of this change, the ability to exit the VM gracefully is
lost (at least on x86). In the existing implementation, the VMM thread
initiated an exit if an event was read from the eventfd but no
VcpuResponse::Exited responses were read (for an unknown reason). Since the
exit_evt eventfd is now also used by vCPUs to notify the VMM thread of VM
exits caused by page faults, this situation (an eventfd event, but no response
in the channel) can also occur simply because all VcpuResponse::Userfault
responses were already read while handling the previous eventfd event.

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 10102ae commit c67eca0

File tree: 6 files changed (+306 / -41 lines)
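
The external half of the protocol described in the commit message is carried as JSON over the UDS socket between the VMM thread and the UFFD handler. A minimal sketch of the two messages, with field names taken from the diff below and field types assumed for illustration (the authoritative FaultRequest and FaultReply definitions live in src/vmm/src/persist.rs):

    use serde::{Deserialize, Serialize};

    /// Sent by the VMM thread to the UFFD handler when a vCPU reports a
    /// fault (see process_vcpu_userfault in src/vmm/src/lib.rs below).
    #[derive(Serialize, Deserialize, Debug)]
    struct FaultRequest {
        /// Index of the faulting vCPU.
        vcpu: u32,
        /// Offset of the faulting page within the guest memory file.
        offset: u64,
        /// Fault flags forwarded from the vCPU exit syndrome (type assumed).
        flags: u64,
        /// Optional opaque token; this commit always sends None.
        token: Option<u64>,
    }

    /// Sent back by the UFFD handler once the page has been populated and the
    /// corresponding bit in the userfault bitmap has been cleared.
    #[derive(Serialize, Deserialize, Debug)]
    struct FaultReply {
        /// Index of the vCPU whose fault has been resolved.
        vcpu: Option<u32>,
    }

On the wire a request looks like {"vcpu":0,"offset":4096,"flags":0,"token":null}; the VMM thread writes it with serde_json::to_string and parses replies back with a streaming deserializer (see the lib.rs hunks below).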

src/vmm/src/builder.rs

Lines changed: 43 additions & 5 deletions
@@ -4,8 +4,9 @@
 //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
 
 use std::fmt::Debug;
+use std::fs::File;
 use std::io::{self, Write};
-use std::os::fd::AsFd;
+use std::os::fd::{AsFd, AsRawFd};
 use std::os::unix::fs::MetadataExt;
 #[cfg(feature = "gdb")]
 use std::sync::mpsc;
@@ -162,7 +163,7 @@ fn create_vmm_and_vcpus(
     // Instantiate ACPI device manager.
     let acpi_device_manager = ACPIDeviceManager::new();
 
-    let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?;
+    let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count, secret_free)?;
 
     #[cfg(target_arch = "x86_64")]
     let pio_device_manager = {
@@ -482,6 +483,41 @@ pub enum BuildMicrovmFromSnapshotError {
     UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
 }
 
+fn memfd_to_slice(memfd: &Option<File>) -> Option<&[u8]> {
+    if let Some(bitmap_file) = memfd {
+        let len = u64_to_usize(
+            bitmap_file
+                .metadata()
+                .expect("Failed to get metadata")
+                .len(),
+        );
+
+        // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
+        let bitmap_addr = unsafe {
+            libc::mmap(
+                std::ptr::null_mut(),
+                len,
+                libc::PROT_WRITE,
+                libc::MAP_SHARED,
+                bitmap_file.as_raw_fd(),
+                0,
+            )
+        };
+
+        if bitmap_addr == libc::MAP_FAILED {
+            panic!(
+                "Failed to mmap userfault bitmap file: {}",
+                std::io::Error::last_os_error()
+            );
+        }
+
+        // SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
+        Some(unsafe { std::slice::from_raw_parts(bitmap_addr as *const u8, len) })
+    } else {
+        None
+    }
+}
+
 /// Builds and starts a microVM based on the provided MicrovmState.
 ///
 /// An `Arc` reference of the built `Vmm` is also plugged in the `EventManager`, while another
@@ -527,7 +563,7 @@ pub fn build_microvm_from_snapshot(
     };
 
     let userfault_bitmap_memfd = if secret_free {
-        let bitmap_size = vm_resources.memory_size() / host_page_size();
+        let bitmap_size = vm_resources.memory_size() / host_page_size() / u8::BITS as usize;
         let bitmap_file = create_memfd(bitmap_size as u64, None)?;
 
         // Set all bits so a fault on any page will cause a VM exit
@@ -580,8 +616,10 @@ pub fn build_microvm_from_snapshot(
         }
     };
 
+    let userfault_bitmap = memfd_to_slice(&userfault_bitmap_memfd);
+
     vmm.vm
-        .register_memory_regions(guest_memory, userfault_bitmap_memfd.as_ref())
+        .register_memory_regions(guest_memory, userfault_bitmap)
         .map_err(VmmError::Vm)
         .map_err(StartMicrovmError::Internal)?;
     vmm.uffd = uffd;
@@ -1045,7 +1083,7 @@ pub(crate) mod tests {
         )
         .unwrap();
 
-        let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap();
+        let (_, vcpus_exit_evt) = vm.create_vcpus(1, false).unwrap();
 
         Vmm {
            events_observer: Some(std::io::stdin()),
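
The bitmap sizing fix in build_microvm_from_snapshot above reflects that the userfault bitmap stores one bit per guest page, so it needs memory_size / page_size / 8 bytes. A small illustrative sketch of the arithmetic; the byte/bit ordering used to locate a page's bit is an assumption for the example, not something this commit defines:

    const PAGE_SIZE: usize = 4096; // assumed host page size

    /// One bit per page: a 128 MiB guest needs 32768 bits, i.e. a 4 KiB bitmap.
    fn bitmap_len(memory_size: usize) -> usize {
        memory_size / PAGE_SIZE / u8::BITS as usize
    }

    /// (byte index, bit mask) covering `offset`, assuming page N maps to bit N
    /// in little-endian bit order within each byte.
    fn bit_for_offset(offset: usize) -> (usize, u8) {
        let page = offset / PAGE_SIZE;
        (page / 8, 1u8 << (page % 8))
    }

    fn main() {
        assert_eq!(bitmap_len(128 << 20), 4096);
        assert_eq!(bit_for_offset(5 * PAGE_SIZE), (0, 0b0010_0000));
    }

Since every bit starts out set, the first access to any page exits to userspace; per the commit message, the UFFD handler clears a page's bit after populating it, so later accesses to that page no longer fault.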

src/vmm/src/lib.rs

Lines changed: 148 additions & 21 deletions
@@ -115,7 +115,8 @@ pub mod vstate;
 pub mod initrd;
 
 use std::collections::HashMap;
-use std::io;
+use std::io::{self, Read, Write};
+use std::os::fd::RawFd;
 use std::os::unix::io::AsRawFd;
 use std::os::unix::net::UnixStream;
 use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use devices::acpi::vmgenid::VmGenIdError;
 use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
 use seccomp::BpfProgram;
 use userfaultfd::Uffd;
+use vm_memory::GuestAddress;
 use vmm_sys_util::epoll::EventSet;
 use vmm_sys_util::eventfd::EventFd;
 use vmm_sys_util::terminal::Terminal;
@@ -147,13 +149,16 @@ use crate::devices::virtio::block::device::Block;
 use crate::devices::virtio::net::Net;
 use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET};
 use crate::logger::{METRICS, MetricsError, error, info, warn};
-use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
+use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
 use crate::rate_limiter::BucketUpdate;
 use crate::snapshot::Persist;
 use crate::vmm_config::instance_info::{InstanceInfo, VmState};
-use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
+use crate::vstate::memory::{
+    GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
+};
 use crate::vstate::vcpu::VcpuState;
 pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
+use crate::vstate::vm::UserfaultData;
 pub use crate::vstate::vm::Vm;
 
 /// Shorthand type for the EventManager flavour used by Firecracker.
@@ -800,6 +805,111 @@ impl Vmm {
         self.shutdown_exit_code = Some(exit_code);
     }
 
+    fn process_vcpu_userfault(&mut self, vcpu: usize, userfault_data: UserfaultData) {
+        let offset = self
+            .vm
+            .guest_memory()
+            .gpa_to_offset(GuestAddress(userfault_data.gpa))
+            .expect("Failed to convert GPA to offset");
+
+        let fault_request = FaultRequest {
+            vcpu: vcpu.try_into().expect("Invalid vCPU index"),
+            offset,
+            flags: userfault_data.flags,
+            token: None,
+        };
+        let fault_request_json =
+            serde_json::to_string(&fault_request).expect("Failed to serialize fault request");
+
+        let written = self
+            .uffd_socket
+            .as_ref()
+            .expect("Uffd socket is not set")
+            .write(fault_request_json.as_bytes())
+            .expect("Failed to write to uffd socket");
+
+        if written != fault_request_json.len() {
+            panic!(
+                "Failed to write the entire fault request to the uffd socket: expected {}, \
+                 written {}",
+                fault_request_json.len(),
+                written
+            );
+        }
+    }
+
+    fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
+        if let Some(uffd_socket) = &self.uffd_socket {
+            uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
+        } else {
+            false
+        }
+    }
+
+    fn process_uffd_socket(&mut self) {
+        const BUFFER_SIZE: usize = 4096;
+
+        let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
+
+        let mut buffer = [0u8; BUFFER_SIZE];
+        let mut current_pos = 0;
+
+        loop {
+            if current_pos < BUFFER_SIZE {
+                match stream.read(&mut buffer[current_pos..]) {
+                    Ok(0) => break,
+                    Ok(n) => current_pos += n,
+                    Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
+                        if current_pos == 0 {
+                            break;
+                        }
+                    }
+                    Err(e) => panic!("Read error: {}", e),
+                }
+            }
+
+            let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
+                .into_iter::<FaultReply>();
+            let mut total_consumed = 0;
+            let mut needs_more = false;
+
+            while let Some(result) = parser.next() {
+                match result {
+                    Ok(fault_reply) => {
+                        let vcpu = fault_reply.vcpu.expect("vCPU must be set");
+
+                        self.vcpus_handles
+                            .get(vcpu as usize)
+                            .expect("Invalid vcpu index")
+                            .send_userfault_resolved();
+
+                        total_consumed = parser.byte_offset();
+                    }
+                    Err(e) if e.is_eof() => {
+                        needs_more = true;
+                        break;
+                    }
+                    Err(e) => {
+                        println!(
+                            "Buffer content: {:?}",
+                            std::str::from_utf8(&buffer[..current_pos])
+                        );
+                        panic!("Invalid JSON: {}", e);
+                    }
+                }
+            }
+
+            if total_consumed > 0 {
+                buffer.copy_within(total_consumed..current_pos, 0);
+                current_pos -= total_consumed;
+            }
+
+            if needs_more {
+                continue;
+            }
+        }
+    }
+
     /// Gets a reference to kvm-ioctls Vm
     #[cfg(feature = "gdb")]
     pub fn vm(&self) -> &Vm {
@@ -882,38 +992,55 @@ impl MutEventSubscriber for Vmm {
         let event_set = event.event_set();
 
         if source == self.vcpus_exit_evt.as_raw_fd() && event_set == EventSet::IN {
-            // Exit event handling should never do anything more than call 'self.stop()'.
             let _ = self.vcpus_exit_evt.read();
 
-            let exit_code = 'exit_code: {
-                // Query each vcpu for their exit_code.
-                for handle in &self.vcpus_handles {
-                    // Drain all vcpu responses that are pending from this vcpu until we find an
-                    // exit status.
-                    for response in handle.response_receiver().try_iter() {
-                        if let VcpuResponse::Exited(status) = response {
-                            // It could be that some vcpus exited successfully while others
-                            // errored out. Thus make sure that error exits from one vcpu always
-                            // takes precedence over "ok" exits
+            let mut pending_userfaults = Vec::with_capacity(self.vcpus_handles.len());
+            let mut should_exit = false;
+            let mut final_exit_code = FcExitCode::Ok;
+
+            // First pass: collect all responses and determine exit status
+            for (index, handle) in self.vcpus_handles.iter().enumerate() {
+                for response in handle.response_receiver().try_iter() {
+                    match response {
+                        VcpuResponse::Exited(status) => {
+                            should_exit = true;
                             if status != FcExitCode::Ok {
-                                break 'exit_code status;
+                                final_exit_code = status;
                             }
                         }
+                        VcpuResponse::Userfault(userfault_data) => {
+                            pending_userfaults.push((index, userfault_data));
+                        }
+                        _ => panic!("Unexpected response from vcpu: {:?}", response),
                     }
                 }
+            }
 
-                // No CPUs exited with error status code, report "Ok"
-                FcExitCode::Ok
-            };
-            self.stop(exit_code);
-        } else {
-            error!("Spurious EventManager event for handler: Vmm");
+            // Process any pending userfaults
+            for (index, userfault_data) in pending_userfaults {
+                self.process_vcpu_userfault(index, userfault_data);
+            }
+
+            // Stop if we received an exit event
+            if should_exit {
+                self.stop(final_exit_code);
+            }
+        }
+
+        if self.active_event_in_uffd_socket(source, event_set) {
+            self.process_uffd_socket();
         }
     }
 
     fn init(&mut self, ops: &mut EventOps) {
         if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
             error!("Failed to register vmm exit event: {}", err);
         }
+
+        if let Some(uffd_socket) = self.uffd_socket.as_ref() {
+            if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) {
+                panic!("Failed to register UFFD socket: {}", err);
+            }
+        }
     }
 }
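
The reply path above has to cope with a non-blocking stream that may deliver several concatenated JSON objects, or only part of one. A self-contained sketch of the same parsing technique (serde_json's streaming deserializer plus byte_offset), using a hypothetical Msg type in place of the real FaultReply:

    use serde::Deserialize;

    /// Hypothetical stand-in for FaultReply.
    #[derive(Deserialize, Debug)]
    struct Msg {
        vcpu: Option<u32>,
    }

    /// Parses every complete JSON object in `buf` and returns the messages plus
    /// the number of bytes consumed; a trailing partial object is left alone.
    fn drain_complete_messages(buf: &[u8]) -> (Vec<Msg>, usize) {
        let mut parser = serde_json::Deserializer::from_slice(buf).into_iter::<Msg>();
        let mut msgs = Vec::new();
        let mut consumed = 0;

        while let Some(result) = parser.next() {
            match result {
                Ok(msg) => {
                    msgs.push(msg);
                    // byte_offset() points just past the last successfully parsed value.
                    consumed = parser.byte_offset();
                }
                // EOF in the middle of an object: keep the tail for the next read.
                Err(e) if e.is_eof() => break,
                Err(e) => panic!("Invalid JSON: {}", e),
            }
        }
        (msgs, consumed)
    }

    fn main() {
        // Two complete replies followed by a truncated third one.
        let buf = br#"{"vcpu":0}{"vcpu":1}{"vc"#;
        let (msgs, consumed) = drain_complete_messages(buf);
        assert_eq!(msgs.len(), 2);
        assert_eq!(msgs[1].vcpu, Some(1));
        assert_eq!(consumed, 20); // the truncated `{"vc` stays in the buffer
    }

process_uffd_socket() applies the same idea in place: it shifts the unparsed tail to the front of its buffer with copy_within() and reads more bytes on the next pass.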

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
@@ -649,6 +649,10 @@ fn send_uffd_handshake(
     let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
 
     let socket = UnixStream::connect(mem_uds_path)?;
+    socket
+        .set_nonblocking(true)
+        .expect("Cannot set non-blocking");
+
     socket.send_with_fds(
         &[backend_mappings.as_bytes()],
         // In the happy case we can close the fd since the other process has it open and is
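
The handshake socket set non-blocking above is the same connection the fault protocol later runs over. For orientation only, a heavily simplified, hypothetical sketch of the handler side of the exchange, assuming the JSON message shapes sketched earlier; the real UFFD handler is a separate process (not part of this diff) and additionally has to populate the faulting page over userfaultfd and clear its bit in the userfault bitmap, which is elided here:

    use std::io::{Read, Write};
    use std::os::unix::net::UnixListener;

    fn serve(uds_path: &str) -> std::io::Result<()> {
        let listener = UnixListener::bind(uds_path)?;
        let (mut stream, _) = listener.accept()?;

        // First message is the handshake with the backend mappings. The memfd
        // descriptors travel as SCM_RIGHTS ancillary data, which a real handler
        // receives with recvmsg; plain read() only sees the JSON bytes.
        let mut handshake = vec![0u8; 4096];
        let n = stream.read(&mut handshake)?;
        println!("handshake: {}", String::from_utf8_lossy(&handshake[..n]));

        let mut buf = vec![0u8; 4096];
        loop {
            // Simplification: assume each read() returns exactly one request.
            let n = stream.read(&mut buf)?;
            if n == 0 {
                return Ok(()); // Firecracker closed the socket
            }
            let request: serde_json::Value = serde_json::from_slice(&buf[..n])?;

            // ... resolve the fault: UFFDIO_COPY the page at request["offset"]
            // and clear the page's bit in the shared bitmap (elided) ...

            // Tell Firecracker the fault for this vCPU has been resolved.
            let reply = serde_json::json!({ "vcpu": request["vcpu"] });
            stream.write_all(reply.to_string().as_bytes())?;
        }
    }

    fn main() {
        // Hypothetical socket path, for illustration only.
        serve("/tmp/firecracker-uffd.sock").expect("handler failed");
    }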
