Skip to content

Commit 83c6309

Browse files
committed
feat(vmm): implement secret-free fault handling protocol
It contains two parts: - external: between the VMM thread and the UFFD handler - internal: between vCPUs and the VMM thread An outline of the workflow: - When a vCPU fault occurs, vCPU exits to userspace - The vCPU thread sends a message to the VMM thread via the userfault channel - The VMM thread forwards the message to the UFFD handler via the UDS socket - The UFFD handler populates the page, clears the corresponding bit in the userfault bitmap and sends a reply to Firecracker - The VMM thread receives the reply and forwards it to the vCPU via the userfault channel - The vCPU resumes execution Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 401417c commit 83c6309

File tree

4 files changed

+283
-10
lines changed

4 files changed

+283
-10
lines changed

src/vmm/src/lib.rs

Lines changed: 192 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ pub mod vstate;
115115
pub mod initrd;
116116

117117
use std::collections::HashMap;
118-
use std::io;
118+
use std::io::{self, Read, Write};
119+
use std::os::fd::RawFd;
119120
use std::os::unix::io::AsRawFd;
120121
use std::os::unix::net::UnixStream;
121122
use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use devices::acpi::vmgenid::VmGenIdError;
128129
use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
129130
use seccomp::BpfProgram;
130131
use userfaultfd::Uffd;
132+
use vm_memory::GuestAddress;
131133
use vmm_sys_util::epoll::EventSet;
132134
use vmm_sys_util::eventfd::EventFd;
133135
use vmm_sys_util::terminal::Terminal;
@@ -147,15 +149,17 @@ use crate::devices::virtio::block::device::Block;
147149
use crate::devices::virtio::net::Net;
148150
use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET};
149151
use crate::logger::{METRICS, MetricsError, error, info, warn};
150-
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
152+
use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
151153
use crate::rate_limiter::BucketUpdate;
152154
use crate::snapshot::Persist;
153155
use crate::vmm_config::instance_info::{InstanceInfo, VmState};
154-
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
156+
use crate::vstate::memory::{
157+
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
158+
};
155159
use crate::vstate::vcpu::VcpuState;
156160
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
157-
use crate::vstate::vm::UserfaultChannel;
158161
pub use crate::vstate::vm::Vm;
162+
use crate::vstate::vm::{UserfaultChannel, UserfaultChannelError, UserfaultData};
159163

160164
/// Shorthand type for the EventManager flavour used by Firecracker.
161165
pub type EventManager = BaseEventManager<Arc<Mutex<dyn MutEventSubscriber>>>;
@@ -803,6 +807,168 @@ impl Vmm {
803807
self.shutdown_exit_code = Some(exit_code);
804808
}
805809

810+
fn active_event_in_userfault_channel(
811+
&self,
812+
source: RawFd,
813+
event_set: EventSet,
814+
) -> Option<usize> {
815+
if let Some(userfault_channels) = &self.userfault_channels {
816+
userfault_channels.iter().position(|channel| {
817+
let receiver = &channel.receiver;
818+
source == receiver.as_raw_fd() && event_set == EventSet::IN
819+
})
820+
} else {
821+
None
822+
}
823+
}
824+
825+
fn process_userfault_channels(&mut self, vcpu: usize) {
826+
loop {
827+
match self
828+
.userfault_channels
829+
.as_mut()
830+
.expect("Userfault channels must be set")[vcpu]
831+
.recv()
832+
{
833+
Ok(userfault_data) => {
834+
let offset = self
835+
.vm
836+
.guest_memory()
837+
.gpa_to_offset(GuestAddress(userfault_data.gpa))
838+
.unwrap();
839+
840+
let fault_request = FaultRequest {
841+
vcpu: vcpu.try_into().expect("Invalid vCPU index"),
842+
offset,
843+
flags: userfault_data.flags,
844+
token: None,
845+
};
846+
let fault_request_json = serde_json::to_string(&fault_request).unwrap();
847+
848+
let written = self
849+
.uffd_socket
850+
.as_ref()
851+
.unwrap()
852+
.write(fault_request_json.as_bytes())
853+
.unwrap();
854+
855+
if written != fault_request_json.len() {
856+
panic!(
857+
"Failed to write the entire fault request to the uffd socket: \
858+
expected {}, written {}",
859+
fault_request_json.len(),
860+
written
861+
);
862+
}
863+
}
864+
Err(ref e) => match e {
865+
UserfaultChannelError::IO(io_e) if io_e.kind() == io::ErrorKind::WouldBlock => {
866+
break;
867+
}
868+
_ => panic!("Error receiving userfault data: {}", e),
869+
},
870+
}
871+
}
872+
}
873+
874+
fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
875+
if let Some(uffd_socket) = &self.uffd_socket {
876+
uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
877+
} else {
878+
false
879+
}
880+
}
881+
882+
fn process_uffd_socket(&mut self) {
883+
const BUFFER_SIZE: usize = 4096;
884+
885+
let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
886+
887+
let mut buffer = [0u8; BUFFER_SIZE];
888+
let mut current_pos = 0;
889+
let mut exit_loop = false;
890+
891+
loop {
892+
if current_pos < BUFFER_SIZE {
893+
match stream.read(&mut buffer[current_pos..]) {
894+
Ok(0) => break,
895+
Ok(n) => current_pos += n,
896+
Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
897+
if exit_loop {
898+
break;
899+
}
900+
}
901+
Err(e) => panic!("Read error: {}", e),
902+
}
903+
904+
exit_loop = false;
905+
}
906+
907+
let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
908+
.into_iter::<FaultReply>();
909+
let mut total_consumed = 0;
910+
let mut needs_more = false;
911+
912+
while let Some(result) = parser.next() {
913+
match result {
914+
Ok(fault_reply) => {
915+
let gpa = self
916+
.vm
917+
.common
918+
.guest_memory
919+
.offset_to_gpa(fault_reply.offset)
920+
.expect("Failed to convert offset to GPA");
921+
922+
let userfaultfd_data = UserfaultData {
923+
flags: fault_reply.flags,
924+
gpa: gpa.0,
925+
size: fault_reply.len,
926+
};
927+
928+
let vcpu = fault_reply.vcpu.expect("vCPU must be set");
929+
930+
self.userfault_channels
931+
.as_mut()
932+
.expect("userfault_channels are not set")
933+
.get_mut(vcpu as usize)
934+
.expect("Invalid vcpu index")
935+
.send(userfaultfd_data)
936+
.expect("Failed to send userfault data");
937+
938+
total_consumed = parser.byte_offset();
939+
}
940+
Err(e) if e.is_eof() => {
941+
needs_more = true;
942+
break;
943+
}
944+
Err(e) => {
945+
println!(
946+
"Buffer content: {:?}",
947+
std::str::from_utf8(&buffer[..current_pos])
948+
);
949+
panic!("Invalid JSON: {}", e);
950+
}
951+
}
952+
}
953+
954+
if total_consumed > 0 {
955+
buffer.copy_within(total_consumed..current_pos, 0);
956+
current_pos -= total_consumed;
957+
}
958+
959+
if needs_more {
960+
continue;
961+
}
962+
963+
// We consumed all data in the buffer, but the socket may have remaining unread data so
964+
// we attempt to read from it and exit the loop only if we confirm that nothing is in
965+
// there.
966+
if current_pos == 0 {
967+
exit_loop = true;
968+
}
969+
}
970+
}
971+
806972
/// Gets a reference to kvm-ioctls Vm
807973
#[cfg(feature = "gdb")]
808974
pub fn vm(&self) -> &Vm {
@@ -909,14 +1075,34 @@ impl MutEventSubscriber for Vmm {
9091075
FcExitCode::Ok
9101076
};
9111077
self.stop(exit_code);
912-
} else {
913-
error!("Spurious EventManager event for handler: Vmm");
1078+
}
1079+
1080+
if let Some(vcpu) = self.active_event_in_userfault_channel(source, event_set) {
1081+
self.process_userfault_channels(vcpu);
1082+
}
1083+
1084+
if self.active_event_in_uffd_socket(source, event_set) {
1085+
self.process_uffd_socket();
9141086
}
9151087
}
9161088

9171089
fn init(&mut self, ops: &mut EventOps) {
9181090
if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
9191091
error!("Failed to register vmm exit event: {}", err);
9201092
}
1093+
1094+
if let Some(uffd_socket) = self.uffd_socket.as_ref() {
1095+
if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) {
1096+
panic!("Failed to register UFFD socket: {}", err);
1097+
}
1098+
}
1099+
1100+
if let Some(userfault_channels) = self.userfault_channels.as_ref() {
1101+
for channel in userfault_channels {
1102+
if let Err(err) = ops.add(Events::new(&channel.receiver, EventSet::IN)) {
1103+
panic!("Failed to register userfault events: {}", err);
1104+
}
1105+
}
1106+
}
9211107
}
9221108
}

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,10 @@ fn send_uffd_handshake(
652652
let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
653653

654654
let socket = UnixStream::connect(mem_uds_path)?;
655+
socket
656+
.set_nonblocking(true)
657+
.expect("Cannot set non-blocking");
658+
655659
socket.send_with_fds(
656660
&[backend_mappings.as_bytes()],
657661
// In the happy case we can close the fd since the other process has it open and is

src/vmm/src/vstate/vcpu.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,15 @@ use crate::logger::{IncMetric, METRICS};
3131
use crate::seccomp::{BpfProgram, BpfProgramRef};
3232
use crate::utils::signal::{Killable, register_signal_handler, sigrtmin};
3333
use crate::utils::sm::StateMachine;
34-
use crate::vstate::vm::{UserfaultChannel, Vm};
34+
use crate::vstate::vm::{UserfaultChannel, UserfaultData, Vm};
3535

3636
/// Signal number (SIGRTMIN) used to kick Vcpus.
3737
pub const VCPU_RTSIG_OFFSET: i32 = 0;
3838

39+
// TODO: remove when KVM userfault support is merged upstream.
40+
/// VM exit due to a userfault.
41+
const KVM_MEMORY_EXIT_FLAG_USERFAULT: u64 = 1 << 4;
42+
3943
/// Errors associated with the wrappers over KVM ioctls.
4044
#[derive(Debug, thiserror::Error, displaydoc::Display)]
4145
pub enum VcpuError {
@@ -312,6 +316,7 @@ impl Vcpu {
312316
// - the other vCPUs won't ever exit out of `KVM_RUN`, but they won't consume CPU.
313317
// So we pause vCPU0 and send a signal to the emulation thread to stop the VMM.
314318
Ok(VcpuEmulation::Stopped) => return self.exit(FcExitCode::Ok),
319+
Ok(VcpuEmulation::Userfault(_)) => unreachable!(),
315320
// If the emulation requests a pause lets do this
316321
#[cfg(feature = "gdb")]
317322
Ok(VcpuEmulation::Paused) => {
@@ -495,6 +500,26 @@ impl Vcpu {
495500
StateMachine::finish()
496501
}
497502

503+
fn handle_userfault(
504+
&mut self,
505+
userfaultfd_data: UserfaultData,
506+
) -> Result<VcpuEmulation, VcpuError> {
507+
let userfault_channel = self
508+
.userfault_channel
509+
.as_mut()
510+
.expect("userfault channel not set");
511+
512+
userfault_channel
513+
.send(userfaultfd_data)
514+
.expect("Failed to send userfault data");
515+
516+
let _ = userfault_channel
517+
.recv()
518+
.expect("Failed to receive userfault response");
519+
520+
Ok(VcpuEmulation::Handled)
521+
}
522+
498523
/// Runs the vCPU in KVM context and handles the kvm exit reason.
499524
///
500525
/// Returns error or enum specifying whether emulation was handled or interrupted.
@@ -505,7 +530,7 @@ impl Vcpu {
505530
return Ok(VcpuEmulation::Interrupted);
506531
}
507532

508-
match self.kvm_vcpu.fd.run() {
533+
let result = match self.kvm_vcpu.fd.run() {
509534
Err(ref err) if err.errno() == libc::EINTR => {
510535
self.kvm_vcpu.fd.set_kvm_immediate_exit(0);
511536
// Notify that this KVM_RUN was interrupted.
@@ -522,7 +547,14 @@ impl Vcpu {
522547
Ok(VcpuEmulation::Paused)
523548
}
524549
emulation_result => handle_kvm_exit(&mut self.kvm_vcpu.peripherals, emulation_result),
525-
}
550+
};
551+
552+
let userfault_data = match result {
553+
Ok(VcpuEmulation::Userfault(userfault_data)) => userfault_data,
554+
_ => return result,
555+
};
556+
557+
self.handle_userfault(userfault_data)
526558
}
527559
}
528560

@@ -600,6 +632,16 @@ fn handle_kvm_exit(
600632
)))
601633
}
602634
},
635+
VcpuExit::MemoryFault { flags, gpa, size } => {
636+
if flags & KVM_MEMORY_EXIT_FLAG_USERFAULT == 0 {
637+
Err(VcpuError::UnhandledKvmExit(format!(
638+
"flags {:x} gpa {:x} size {:x}",
639+
flags, gpa, size
640+
)))
641+
} else {
642+
Ok(VcpuEmulation::Userfault(UserfaultData { flags, gpa, size }))
643+
}
644+
}
603645
arch_specific_reason => {
604646
// run specific architecture emulation.
605647
peripherals.run_arch_emulation(arch_specific_reason)
@@ -761,6 +803,8 @@ pub enum VcpuEmulation {
761803
Interrupted,
762804
/// Stopped.
763805
Stopped,
806+
/// Userfault
807+
Userfault(UserfaultData),
764808
/// Pause request
765809
#[cfg(feature = "gdb")]
766810
Paused,

0 commit comments

Comments
 (0)