Commit 3885113

kalyazin authored and roypat committed
feat(vmm): implement secret-free fault handling protocol
It contains two parts:
- external: between the VMM thread and the UFFD handler
- internal: between vCPUs and the VMM thread

An outline of the workflow:
- When a vCPU fault occurs, the vCPU exits to userspace
- The vCPU thread sends the exit syndrome over the vCPU-to-VMM channel and writes to the eventfd
- The VMM thread forwards the syndrome to the UFFD handler via the UDS socket
- The UFFD handler populates the page, clears the corresponding bit in the userfault bitmap, and sends a reply to Firecracker
- The VMM thread receives the reply and updates a vCPU condvar to notify the vCPU that the fault has been resolved
- The vCPU resumes execution

Note that as a result of this change, the ability to exit the VM gracefully is lost (at least on x86). In the existing implementation, the VMM thread initiated an exit if an event was read from the eventfd but no VcpuResponse::Exited responses were read, for an unknown reason. Since the exit_evt eventfd is now also used by vCPUs to notify the VMM thread of VM exits caused by page faults, this situation (an eventfd event, but no response in the channel) can now also occur because all VcpuResponse::Userfault responses were already read in response to a previous eventfd event.

Signed-off-by: Nikita Kalyazin <[email protected]>
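For orientation, the external half of the protocol exchanges JSON messages over the UDS socket. Below is a minimal sketch of the two message shapes, inferred from the diffs that follow: the field names match the code, but the concrete types and derives are assumptions, since the real definitions in src/vmm/src/persist.rs are not shown in this commit view.

```rust
use serde::{Deserialize, Serialize};

// Sent by the VMM thread to the UFFD handler when a vCPU faults.
// Field types here are assumptions inferred from usage in this commit.
#[derive(Serialize, Deserialize)]
struct FaultRequest {
    /// Index of the faulting vCPU.
    vcpu: u32,
    /// Offset of the faulting page within the guest memory file,
    /// computed from the GPA by the VMM thread.
    offset: u64,
    /// Fault flags forwarded from the vCPU exit syndrome.
    flags: u64,
    /// Unused by this commit; always `None`.
    token: Option<u64>,
}

// Sent back by the UFFD handler once it has populated the page and
// cleared the corresponding bit in the userfault bitmap.
#[derive(Serialize, Deserialize)]
struct FaultReply {
    /// The vCPU whose fault was resolved; the VMM panics if unset.
    vcpu: Option<u32>,
}
```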
1 parent b950f5b · commit 3885113

File tree

6 files changed: +267 −38 lines


src/vmm/src/builder.rs

Lines changed: 2 additions & 2 deletions

@@ -163,7 +163,7 @@ fn create_vmm_and_vcpus(
     // Instantiate ACPI device manager.
     let acpi_device_manager = ACPIDeviceManager::new();

-    let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?;
+    let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count, secret_free)?;

     #[cfg(target_arch = "x86_64")]
     let pio_device_manager = {
@@ -1080,7 +1080,7 @@ pub(crate) mod tests {
         )
         .unwrap();

-        let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap();
+        let (_, vcpus_exit_evt) = vm.create_vcpus(1, false).unwrap();

         Vmm {
             events_observer: Some(std::io::stdin()),

src/vmm/src/lib.rs

Lines changed: 148 additions & 21 deletions

@@ -115,7 +115,8 @@ pub mod vstate;
 pub mod initrd;

 use std::collections::HashMap;
-use std::io;
+use std::io::{self, Read, Write};
+use std::os::fd::RawFd;
 use std::os::unix::io::AsRawFd;
 use std::os::unix::net::UnixStream;
 use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use devices::acpi::vmgenid::VmGenIdError;
 use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
 use seccomp::BpfProgram;
 use userfaultfd::Uffd;
+use vm_memory::GuestAddress;
 use vmm_sys_util::epoll::EventSet;
 use vmm_sys_util::eventfd::EventFd;
 use vmm_sys_util::terminal::Terminal;
@@ -147,13 +149,16 @@ use crate::devices::virtio::block::device::Block;
 use crate::devices::virtio::net::Net;
 use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET};
 use crate::logger::{METRICS, MetricsError, error, info, warn};
-use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
+use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
 use crate::rate_limiter::BucketUpdate;
 use crate::snapshot::Persist;
 use crate::vmm_config::instance_info::{InstanceInfo, VmState};
-use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
+use crate::vstate::memory::{
+    GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
+};
 use crate::vstate::vcpu::VcpuState;
 pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
+use crate::vstate::vm::UserfaultData;
 pub use crate::vstate::vm::Vm;

 /// Shorthand type for the EventManager flavour used by Firecracker.
@@ -800,6 +805,111 @@ impl Vmm {
         self.shutdown_exit_code = Some(exit_code);
     }

+    fn process_vcpu_userfault(&mut self, vcpu: u32, userfault_data: UserfaultData) {
+        let offset = self
+            .vm
+            .guest_memory()
+            .gpa_to_offset(GuestAddress(userfault_data.gpa))
+            .expect("Failed to convert GPA to offset");
+
+        let fault_request = FaultRequest {
+            vcpu,
+            offset,
+            flags: userfault_data.flags,
+            token: None,
+        };
+        let fault_request_json =
+            serde_json::to_string(&fault_request).expect("Failed to serialize fault request");
+
+        let written = self
+            .uffd_socket
+            .as_ref()
+            .expect("Uffd socket is not set")
+            .write(fault_request_json.as_bytes())
+            .expect("Failed to write to uffd socket");
+
+        if written != fault_request_json.len() {
+            panic!(
+                "Failed to write the entire fault request to the uffd socket: expected {}, \
+                 written {}",
+                fault_request_json.len(),
+                written
+            );
+        }
+    }
+
+    fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
+        if let Some(uffd_socket) = &self.uffd_socket {
+            uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
+        } else {
+            false
+        }
+    }
+
+    fn process_uffd_socket(&mut self) {
+        const BUFFER_SIZE: usize = 4096;
+
+        let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
+
+        let mut buffer = [0u8; BUFFER_SIZE];
+        let mut current_pos = 0;
+
+        loop {
+            if current_pos < BUFFER_SIZE {
+                match stream.read(&mut buffer[current_pos..]) {
+                    Ok(0) => break,
+                    Ok(n) => current_pos += n,
+                    Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
+                        if current_pos == 0 {
+                            break;
+                        }
+                    }
+                    Err(e) => panic!("Read error: {}", e),
+                }
+            }
+
+            let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
+                .into_iter::<FaultReply>();
+            let mut total_consumed = 0;
+            let mut needs_more = false;
+
+            while let Some(result) = parser.next() {
+                match result {
+                    Ok(fault_reply) => {
+                        let vcpu = fault_reply.vcpu.expect("vCPU must be set");
+
+                        self.vcpus_handles
+                            .get(vcpu as usize)
+                            .expect("Invalid vcpu index")
+                            .send_userfault_resolved();
+
+                        total_consumed = parser.byte_offset();
+                    }
+                    Err(e) if e.is_eof() => {
+                        needs_more = true;
+                        break;
+                    }
+                    Err(e) => {
+                        println!(
+                            "Buffer content: {:?}",
+                            std::str::from_utf8(&buffer[..current_pos])
+                        );
+                        panic!("Invalid JSON: {}", e);
+                    }
+                }
+            }
+
+            if total_consumed > 0 {
+                buffer.copy_within(total_consumed..current_pos, 0);
+                current_pos -= total_consumed;
+            }
+
+            if needs_more {
+                continue;
+            }
+        }
+    }
+
     /// Gets a reference to kvm-ioctls Vm
     #[cfg(feature = "gdb")]
     pub fn vm(&self) -> &Vm {
@@ -882,38 +992,55 @@ impl MutEventSubscriber for Vmm {
         let event_set = event.event_set();

         if source == self.vcpus_exit_evt.as_raw_fd() && event_set == EventSet::IN {
-            // Exit event handling should never do anything more than call 'self.stop()'.
             let _ = self.vcpus_exit_evt.read();

-            let exit_code = 'exit_code: {
-                // Query each vcpu for their exit_code.
-                for handle in &self.vcpus_handles {
-                    // Drain all vcpu responses that are pending from this vcpu until we find an
-                    // exit status.
-                    for response in handle.response_receiver().try_iter() {
-                        if let VcpuResponse::Exited(status) = response {
-                            // It could be that some vcpus exited successfully while others
-                            // errored out. Thus make sure that error exits from one vcpu always
-                            // takes precedence over "ok" exits
+            let mut pending_userfaults = Vec::with_capacity(self.vcpus_handles.len());
+            let mut should_exit = false;
+            let mut final_exit_code = FcExitCode::Ok;
+
+            // First pass: collect all responses and determine exit status
+            for (handle, index) in self.vcpus_handles.iter().zip(0u32..) {
+                for response in handle.response_receiver().try_iter() {
+                    match response {
+                        VcpuResponse::Exited(status) => {
+                            should_exit = true;
                             if status != FcExitCode::Ok {
-                                break 'exit_code status;
+                                final_exit_code = status;
                             }
                         }
+                        VcpuResponse::Userfault(userfault_data) => {
+                            pending_userfaults.push((index, userfault_data));
+                        }
+                        _ => panic!("Unexpected response from vcpu: {:?}", response),
                     }
                 }
+            }

-                // No CPUs exited with error status code, report "Ok"
-                FcExitCode::Ok
-            };
-            self.stop(exit_code);
-        } else {
-            error!("Spurious EventManager event for handler: Vmm");
+            // Process any pending userfaults
+            for (index, userfault_data) in pending_userfaults {
+                self.process_vcpu_userfault(index, userfault_data);
+            }
+
+            // Stop if we received an exit event
+            if should_exit {
+                self.stop(final_exit_code);
+            }
+        }
+
+        if self.active_event_in_uffd_socket(source, event_set) {
+            self.process_uffd_socket();
         }
     }

     fn init(&mut self, ops: &mut EventOps) {
         if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
             error!("Failed to register vmm exit event: {}", err);
         }
+
+        if let Some(uffd_socket) = self.uffd_socket.as_ref() {
+            if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) {
+                panic!("Failed to register UFFD socket: {}", err);
+            }
+        }
     }
 }
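Because the UDS socket is a byte stream, a single read in process_uffd_socket may deliver several FaultReply messages at once, or only part of one. Below is a self-contained sketch of the incremental parsing technique the method relies on (serde_json's StreamDeserializer plus byte_offset()); `Reply` and `drain_replies` are hypothetical stand-ins for illustration, not part of the commit.

```rust
use serde::Deserialize;

// Stand-in for FaultReply; hypothetical, for illustration only.
#[derive(Deserialize, Debug)]
struct Reply {
    vcpu: Option<u32>,
}

// Parse as many complete JSON values as the buffer holds, then drop the
// consumed bytes, keeping any trailing partial message for the next read.
fn drain_replies(buffer: &mut Vec<u8>) -> Vec<Reply> {
    let mut replies = Vec::new();
    let consumed = {
        let mut parser = serde_json::Deserializer::from_slice(buffer).into_iter::<Reply>();
        let mut consumed = 0;
        while let Some(result) = parser.next() {
            match result {
                Ok(reply) => {
                    replies.push(reply);
                    // byte_offset() marks the end of the last complete value.
                    consumed = parser.byte_offset();
                }
                // EOF here means the last value is incomplete, not an error:
                // keep its bytes and wait for more data.
                Err(e) if e.is_eof() => break,
                Err(e) => panic!("invalid JSON on the reply socket: {}", e),
            }
        }
        consumed
    };
    buffer.drain(..consumed);
    replies
}

fn main() {
    // Two complete replies followed by a partial one, as a single read
    // from a stream socket might deliver them.
    let mut buf = b"{\"vcpu\":0}{\"vcpu\":1}{\"vc".to_vec();
    let replies = drain_replies(&mut buf);
    assert_eq!(replies.len(), 2);
    assert_eq!(buf, b"{\"vc".to_vec()); // partial message retained
    println!("parsed {:?}, {} bytes left over", replies, buf.len());
}
```

Since byte_offset() only advances past complete values, the compaction step never discards bytes of a partially received message.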

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions

@@ -598,6 +598,10 @@ fn send_uffd_handshake(
     let backend_mappings = serde_json::to_string(backend_mappings).unwrap();

     let socket = UnixStream::connect(mem_uds_path)?;
+    socket
+        .set_nonblocking(true)
+        .expect("Cannot set non-blocking");
+
     socket.send_with_fds(
         &[backend_mappings.as_bytes()],
         // In the happy case we can close the fd since the other process has it open and is
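The handshake socket is the same stream the VMM later polls for FaultReply messages from its event loop, so it must not block. A minimal sketch of the resulting read pattern, using a socketpair as a stand-in for the Firecracker/UFFD-handler connection:

```rust
use std::io::{ErrorKind, Read};
use std::os::unix::net::UnixStream;

fn main() -> std::io::Result<()> {
    // A socketpair stands in for the Firecracker <-> UFFD-handler connection.
    let (mut vmm_side, _handler_side) = UnixStream::pair()?;
    vmm_side.set_nonblocking(true)?;

    let mut buf = [0u8; 4096];
    match vmm_side.read(&mut buf) {
        // Nothing to read yet: return to the event loop and wait for the
        // next EventSet::IN notification instead of blocking in read().
        Err(e) if e.kind() == ErrorKind::WouldBlock => {
            println!("no data yet, back to the event loop");
        }
        Ok(0) => println!("peer closed the socket"),
        Ok(n) => println!("read {} bytes", n),
        Err(e) => return Err(e),
    }
    Ok(())
}
```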
