Skip to content

Commit d94cfd7

Browse files
kalyazin authored and roypat committed
feat(vmm): implement secret-free fault handling protocol
It contains two parts: - external: between the VMM thread and the UFFD handler - internal: between vCPUs and the VMM thread An outline of the workflow: - When a vCPU fault occurs, the vCPU exits to userspace - The vCPU thread sends the exit syndrome into the vCPU-to-VMM channel and writes to the eventfd - The VMM thread forwards the syndrome to the UFFD handler via the UDS socket - The UFFD handler populates the page, clears the corresponding bit in the userfault bitmap and sends a reply to Firecracker - The VMM thread receives the reply and updates a vCPU condvar to notify the vCPU that the fault has been resolved - The vCPU resumes execution Note that as a result of this change, the ability to exit the VM gracefully is lost (at least on x86). In the existing implementation, the VMM thread initiated an exit if an event was read from the eventfd, but no VcpuResponse::Exited responses were read for an unknown reason. Since the exit_evt eventfd is now also used by vCPUs to notify the VMM thread of the VM exits caused by page faults, this situation (an eventfd event, but no response in the channel) can also occur because we have already read all VcpuResponse::Userfault responses while handling the previous eventfd event. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 455cff7 commit d94cfd7

File tree

6 files changed

+269
-39
lines changed

6 files changed

+269
-39
lines changed

src/vmm/src/builder.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ pub fn build_microvm_for_boot(
187187
// Set up Kvm Vm and register memory regions.
188188
// Build custom CPU config if a custom template is provided.
189189
let mut vm = Vm::new(&kvm, secret_free)?;
190-
let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?;
190+
let (mut vcpus, vcpus_exit_evt) =
191+
vm.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)?;
191192

192193
let guest_memfd = match secret_free {
193194
true => Some(
@@ -539,7 +540,7 @@ pub fn build_microvm_from_snapshot(
539540
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
540541

541542
let (mut vcpus, vcpus_exit_evt) = vm
542-
.create_vcpus(vm_resources.machine_config.vcpu_count)
543+
.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)
543544
.map_err(StartMicrovmError::Vm)?;
544545

545546
let guest_memfd = match secret_free {
@@ -931,7 +932,7 @@ pub(crate) mod tests {
931932
pub(crate) fn default_vmm() -> Vmm {
932933
let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128));
933934

934-
let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap();
935+
let (_, vcpus_exit_evt) = vm.create_vcpus(1, false).unwrap();
935936

936937
Vmm {
937938
events_observer: Some(std::io::stdin()),

src/vmm/src/lib.rs

Lines changed: 148 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ pub mod vstate;
115115
pub mod initrd;
116116

117117
use std::collections::HashMap;
118-
use std::io;
118+
use std::io::{self, Read, Write};
119+
use std::os::fd::RawFd;
119120
use std::os::unix::io::AsRawFd;
120121
use std::os::unix::net::UnixStream;
121122
use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEvent
128129
use seccomp::BpfProgram;
129130
use snapshot::Persist;
130131
use userfaultfd::Uffd;
132+
use vm_memory::GuestAddress;
131133
use vmm_sys_util::epoll::EventSet;
132134
use vmm_sys_util::eventfd::EventFd;
133135
use vmm_sys_util::terminal::Terminal;
@@ -142,12 +144,15 @@ use crate::devices::virtio::block::device::Block;
142144
use crate::devices::virtio::net::Net;
143145
use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET};
144146
use crate::logger::{METRICS, MetricsError, error, info, warn};
145-
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
147+
use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
146148
use crate::rate_limiter::BucketUpdate;
147149
use crate::vmm_config::instance_info::{InstanceInfo, VmState};
148-
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
150+
use crate::vstate::memory::{
151+
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
152+
};
149153
use crate::vstate::vcpu::VcpuState;
150154
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
155+
use crate::vstate::vm::UserfaultData;
151156
pub use crate::vstate::vm::Vm;
152157

153158
/// Shorthand type for the EventManager flavour used by Firecracker.
@@ -708,6 +713,111 @@ impl Vmm {
708713
self.shutdown_exit_code = Some(exit_code);
709714
}
710715

716+
fn process_vcpu_userfault(&mut self, vcpu: u32, userfault_data: UserfaultData) {
717+
let offset = self
718+
.vm
719+
.guest_memory()
720+
.gpa_to_offset(GuestAddress(userfault_data.gpa))
721+
.expect("Failed to convert GPA to offset");
722+
723+
let fault_request = FaultRequest {
724+
vcpu,
725+
offset,
726+
flags: userfault_data.flags,
727+
token: None,
728+
};
729+
let fault_request_json =
730+
serde_json::to_string(&fault_request).expect("Failed to serialize fault request");
731+
732+
let written = self
733+
.uffd_socket
734+
.as_ref()
735+
.expect("Uffd socket is not set")
736+
.write(fault_request_json.as_bytes())
737+
.expect("Failed to write to uffd socket");
738+
739+
if written != fault_request_json.len() {
740+
panic!(
741+
"Failed to write the entire fault request to the uffd socket: expected {}, \
742+
written {}",
743+
fault_request_json.len(),
744+
written
745+
);
746+
}
747+
}
748+
749+
fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
750+
if let Some(uffd_socket) = &self.uffd_socket {
751+
uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
752+
} else {
753+
false
754+
}
755+
}
756+
757+
fn process_uffd_socket(&mut self) {
758+
const BUFFER_SIZE: usize = 4096;
759+
760+
let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
761+
762+
let mut buffer = [0u8; BUFFER_SIZE];
763+
let mut current_pos = 0;
764+
765+
loop {
766+
if current_pos < BUFFER_SIZE {
767+
match stream.read(&mut buffer[current_pos..]) {
768+
Ok(0) => break,
769+
Ok(n) => current_pos += n,
770+
Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
771+
if current_pos == 0 {
772+
break;
773+
}
774+
}
775+
Err(e) => panic!("Read error: {}", e),
776+
}
777+
}
778+
779+
let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
780+
.into_iter::<FaultReply>();
781+
let mut total_consumed = 0;
782+
let mut needs_more = false;
783+
784+
while let Some(result) = parser.next() {
785+
match result {
786+
Ok(fault_reply) => {
787+
let vcpu = fault_reply.vcpu.expect("vCPU must be set");
788+
789+
self.vcpus_handles
790+
.get(vcpu as usize)
791+
.expect("Invalid vcpu index")
792+
.send_userfault_resolved();
793+
794+
total_consumed = parser.byte_offset();
795+
}
796+
Err(e) if e.is_eof() => {
797+
needs_more = true;
798+
break;
799+
}
800+
Err(e) => {
801+
println!(
802+
"Buffer content: {:?}",
803+
std::str::from_utf8(&buffer[..current_pos])
804+
);
805+
panic!("Invalid JSON: {}", e);
806+
}
807+
}
808+
}
809+
810+
if total_consumed > 0 {
811+
buffer.copy_within(total_consumed..current_pos, 0);
812+
current_pos -= total_consumed;
813+
}
814+
815+
if needs_more {
816+
continue;
817+
}
818+
}
819+
}
820+
711821
/// Gets a reference to kvm-ioctls Vm
712822
#[cfg(feature = "gdb")]
713823
pub fn vm(&self) -> &Vm {
@@ -790,38 +900,55 @@ impl MutEventSubscriber for Vmm {
790900
let event_set = event.event_set();
791901

792902
if source == self.vcpus_exit_evt.as_raw_fd() && event_set == EventSet::IN {
793-
// Exit event handling should never do anything more than call 'self.stop()'.
794903
let _ = self.vcpus_exit_evt.read();
795904

796-
let exit_code = 'exit_code: {
797-
// Query each vcpu for their exit_code.
798-
for handle in &self.vcpus_handles {
799-
// Drain all vcpu responses that are pending from this vcpu until we find an
800-
// exit status.
801-
for response in handle.response_receiver().try_iter() {
802-
if let VcpuResponse::Exited(status) = response {
803-
// It could be that some vcpus exited successfully while others
804-
// errored out. Thus make sure that error exits from one vcpu always
805-
// takes precedence over "ok" exits
905+
let mut pending_userfaults = Vec::with_capacity(self.vcpus_handles.len());
906+
let mut should_exit = false;
907+
let mut final_exit_code = FcExitCode::Ok;
908+
909+
// First pass: collect all responses and determine exit status
910+
for (handle, index) in self.vcpus_handles.iter().zip(0u32..) {
911+
for response in handle.response_receiver().try_iter() {
912+
match response {
913+
VcpuResponse::Exited(status) => {
914+
should_exit = true;
806915
if status != FcExitCode::Ok {
807-
break 'exit_code status;
916+
final_exit_code = status;
808917
}
809918
}
919+
VcpuResponse::Userfault(userfault_data) => {
920+
pending_userfaults.push((index, userfault_data));
921+
}
922+
_ => panic!("Unexpected response from vcpu: {:?}", response),
810923
}
811924
}
925+
}
812926

813-
// No CPUs exited with error status code, report "Ok"
814-
FcExitCode::Ok
815-
};
816-
self.stop(exit_code);
817-
} else {
818-
error!("Spurious EventManager event for handler: Vmm");
927+
// Process any pending userfaults
928+
for (index, userfault_data) in pending_userfaults {
929+
self.process_vcpu_userfault(index, userfault_data);
930+
}
931+
932+
// Stop if we received an exit event
933+
if should_exit {
934+
self.stop(final_exit_code);
935+
}
936+
}
937+
938+
if self.active_event_in_uffd_socket(source, event_set) {
939+
self.process_uffd_socket();
819940
}
820941
}
821942

822943
fn init(&mut self, ops: &mut EventOps) {
823944
if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
824945
error!("Failed to register vmm exit event: {}", err);
825946
}
947+
948+
if let Some(uffd_socket) = self.uffd_socket.as_ref() {
949+
if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) {
950+
panic!("Failed to register UFFD socket: {}", err);
951+
}
952+
}
826953
}
827954
}

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,10 @@ fn send_uffd_handshake(
590590
let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
591591

592592
let socket = UnixStream::connect(mem_uds_path)?;
593+
socket
594+
.set_nonblocking(true)
595+
.expect("Cannot set non-blocking");
596+
593597
socket.send_with_fds(
594598
&[backend_mappings.as_bytes()],
595599
// In the happy case we can close the fd since the other process has it open and is

0 commit comments

Comments
 (0)