Skip to content

Commit 5fa89be

Browse files
kalyazinroypat
authored andcommitted
feat(vmm): implement secret-free fault handling protocol
It contains two parts: - external: between the VMM thread and the UFFD handler - internal: between vCPUs and the VMM thread An outline of the workflow: - When a vCPU fault occurs, the vCPU exits to userspace - The vCPU thread sends the exit syndrome over the vCPU-to-VMM channel and writes to the eventfd - The VMM thread forwards the syndrome to the UFFD handler via the UDS socket - The UFFD handler populates the page, clears the corresponding bit in the userfault bitmap and sends a reply to Firecracker - The VMM thread receives the reply and updates a vCPU condvar to notify the vCPU that the fault has been resolved - The vCPU resumes execution Note that as a result of this change, the ability to exit the VM gracefully is lost (at least on x86). In the existing implementation, the VMM thread initiated an exit if an event was read from the eventfd, but no VcpuResponse::Exited responses were read for an unknown reason. Since the exit_evt eventfd is now also used by vCPUs to notify the VMM thread of the VM exits caused by pagefaults, this situation (an eventfd event, but no response in the channel) can also occur because we have already read all VcpuResponse::Userfault responses in response to the previous eventfd event. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent fe0d665 commit 5fa89be

File tree

6 files changed

+269
-39
lines changed

6 files changed

+269
-39
lines changed

src/vmm/src/builder.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ pub fn build_microvm_for_boot(
187187
// Set up Kvm Vm and register memory regions.
188188
// Build custom CPU config if a custom template is provided.
189189
let mut vm = Vm::new(&kvm, secret_free)?;
190-
let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?;
190+
let (mut vcpus, vcpus_exit_evt) =
191+
vm.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)?;
191192

192193
let guest_memfd = match secret_free {
193194
true => Some(
@@ -548,7 +549,7 @@ pub fn build_microvm_from_snapshot(
548549
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
549550

550551
let (mut vcpus, vcpus_exit_evt) = vm
551-
.create_vcpus(vm_resources.machine_config.vcpu_count)
552+
.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)
552553
.map_err(StartMicrovmError::Vm)?;
553554

554555
let guest_memfd = match secret_free {
@@ -939,7 +940,7 @@ pub(crate) mod tests {
939940
pub(crate) fn default_vmm() -> Vmm {
940941
let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128));
941942

942-
let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap();
943+
let (_, vcpus_exit_evt) = vm.create_vcpus(1, false).unwrap();
943944

944945
Vmm {
945946
instance_info: InstanceInfo::default(),

src/vmm/src/lib.rs

Lines changed: 148 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ pub mod vstate;
115115
pub mod initrd;
116116

117117
use std::collections::HashMap;
118-
use std::io;
118+
use std::io::{self, Read, Write};
119+
use std::os::fd::RawFd;
119120
use std::os::unix::io::AsRawFd;
120121
use std::os::unix::net::UnixStream;
121122
use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEvent
128129
use seccomp::BpfProgram;
129130
use snapshot::Persist;
130131
use userfaultfd::Uffd;
132+
use vm_memory::GuestAddress;
131133
use vmm_sys_util::epoll::EventSet;
132134
use vmm_sys_util::eventfd::EventFd;
133135
use vmm_sys_util::terminal::Terminal;
@@ -139,12 +141,15 @@ use crate::devices::virtio::balloon::{BALLOON_DEV_ID, Balloon, BalloonConfig, Ba
139141
use crate::devices::virtio::block::device::Block;
140142
use crate::devices::virtio::net::Net;
141143
use crate::logger::{METRICS, MetricsError, error, info, warn};
142-
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
144+
use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
143145
use crate::rate_limiter::BucketUpdate;
144146
use crate::vmm_config::instance_info::{InstanceInfo, VmState};
145-
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
147+
use crate::vstate::memory::{
148+
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
149+
};
146150
use crate::vstate::vcpu::VcpuState;
147151
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
152+
use crate::vstate::vm::UserfaultData;
148153
pub use crate::vstate::vm::Vm;
149154

150155
/// Shorthand type for the EventManager flavour used by Firecracker.
@@ -633,6 +638,111 @@ impl Vmm {
633638
self.shutdown_exit_code = Some(exit_code);
634639
}
635640

641+
fn process_vcpu_userfault(&mut self, vcpu: u32, userfault_data: UserfaultData) {
642+
let offset = self
643+
.vm
644+
.guest_memory()
645+
.gpa_to_offset(GuestAddress(userfault_data.gpa))
646+
.expect("Failed to convert GPA to offset");
647+
648+
let fault_request = FaultRequest {
649+
vcpu,
650+
offset,
651+
flags: userfault_data.flags,
652+
token: None,
653+
};
654+
let fault_request_json =
655+
serde_json::to_string(&fault_request).expect("Failed to serialize fault request");
656+
657+
let written = self
658+
.uffd_socket
659+
.as_ref()
660+
.expect("Uffd socket is not set")
661+
.write(fault_request_json.as_bytes())
662+
.expect("Failed to write to uffd socket");
663+
664+
if written != fault_request_json.len() {
665+
panic!(
666+
"Failed to write the entire fault request to the uffd socket: expected {}, \
667+
written {}",
668+
fault_request_json.len(),
669+
written
670+
);
671+
}
672+
}
673+
674+
fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
675+
if let Some(uffd_socket) = &self.uffd_socket {
676+
uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
677+
} else {
678+
false
679+
}
680+
}
681+
682+
fn process_uffd_socket(&mut self) {
683+
const BUFFER_SIZE: usize = 4096;
684+
685+
let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
686+
687+
let mut buffer = [0u8; BUFFER_SIZE];
688+
let mut current_pos = 0;
689+
690+
loop {
691+
if current_pos < BUFFER_SIZE {
692+
match stream.read(&mut buffer[current_pos..]) {
693+
Ok(0) => break,
694+
Ok(n) => current_pos += n,
695+
Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
696+
if current_pos == 0 {
697+
break;
698+
}
699+
}
700+
Err(e) => panic!("Read error: {}", e),
701+
}
702+
}
703+
704+
let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
705+
.into_iter::<FaultReply>();
706+
let mut total_consumed = 0;
707+
let mut needs_more = false;
708+
709+
while let Some(result) = parser.next() {
710+
match result {
711+
Ok(fault_reply) => {
712+
let vcpu = fault_reply.vcpu.expect("vCPU must be set");
713+
714+
self.vcpus_handles
715+
.get(vcpu as usize)
716+
.expect("Invalid vcpu index")
717+
.send_userfault_resolved();
718+
719+
total_consumed = parser.byte_offset();
720+
}
721+
Err(e) if e.is_eof() => {
722+
needs_more = true;
723+
break;
724+
}
725+
Err(e) => {
726+
println!(
727+
"Buffer content: {:?}",
728+
std::str::from_utf8(&buffer[..current_pos])
729+
);
730+
panic!("Invalid JSON: {}", e);
731+
}
732+
}
733+
}
734+
735+
if total_consumed > 0 {
736+
buffer.copy_within(total_consumed..current_pos, 0);
737+
current_pos -= total_consumed;
738+
}
739+
740+
if needs_more {
741+
continue;
742+
}
743+
}
744+
}
745+
636746
/// Gets a reference to kvm-ioctls Vm
637747
#[cfg(feature = "gdb")]
638748
pub fn vm(&self) -> &Vm {
@@ -710,38 +820,55 @@ impl MutEventSubscriber for Vmm {
710820
let event_set = event.event_set();
711821

712822
if source == self.vcpus_exit_evt.as_raw_fd() && event_set == EventSet::IN {
713-
// Exit event handling should never do anything more than call 'self.stop()'.
714823
let _ = self.vcpus_exit_evt.read();
715824

716-
let exit_code = 'exit_code: {
717-
// Query each vcpu for their exit_code.
718-
for handle in &self.vcpus_handles {
719-
// Drain all vcpu responses that are pending from this vcpu until we find an
720-
// exit status.
721-
for response in handle.response_receiver().try_iter() {
722-
if let VcpuResponse::Exited(status) = response {
723-
// It could be that some vcpus exited successfully while others
724-
// errored out. Thus make sure that error exits from one vcpu always
725-
// takes precedence over "ok" exits
825+
let mut pending_userfaults = Vec::with_capacity(self.vcpus_handles.len());
826+
let mut should_exit = false;
827+
let mut final_exit_code = FcExitCode::Ok;
828+
829+
// First pass: collect all responses and determine exit status
830+
for (handle, index) in self.vcpus_handles.iter().zip(0u32..) {
831+
for response in handle.response_receiver().try_iter() {
832+
match response {
833+
VcpuResponse::Exited(status) => {
834+
should_exit = true;
726835
if status != FcExitCode::Ok {
727-
break 'exit_code status;
836+
final_exit_code = status;
728837
}
729838
}
839+
VcpuResponse::Userfault(userfault_data) => {
840+
pending_userfaults.push((index, userfault_data));
841+
}
842+
_ => panic!("Unexpected response from vcpu: {:?}", response),
730843
}
731844
}
845+
}
732846

733-
// No CPUs exited with error status code, report "Ok"
734-
FcExitCode::Ok
735-
};
736-
self.stop(exit_code);
737-
} else {
738-
error!("Spurious EventManager event for handler: Vmm");
847+
// Process any pending userfaults
848+
for (index, userfault_data) in pending_userfaults {
849+
self.process_vcpu_userfault(index, userfault_data);
850+
}
851+
852+
// Stop if we received an exit event
853+
if should_exit {
854+
self.stop(final_exit_code);
855+
}
856+
}
857+
858+
if self.active_event_in_uffd_socket(source, event_set) {
859+
self.process_uffd_socket();
739860
}
740861
}
741862

742863
fn init(&mut self, ops: &mut EventOps) {
743864
if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
744865
error!("Failed to register vmm exit event: {}", err);
745866
}
867+
868+
if let Some(uffd_socket) = self.uffd_socket.as_ref()
869+
&& let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN))
870+
{
871+
panic!("Failed to register UFFD socket: {}", err);
872+
}
746873
}
747874
}

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,10 @@ fn send_uffd_handshake(
583583
let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
584584

585585
let socket = UnixStream::connect(mem_uds_path)?;
586+
socket
587+
.set_nonblocking(true)
588+
.expect("Cannot set non-blocking");
589+
586590
socket.send_with_fds(
587591
&[backend_mappings.as_bytes()],
588592
// In the happy case we can close the fd since the other process has it open and is

0 commit comments

Comments
 (0)