Skip to content

Commit d464f08

Browse files
committed
feat(vmm): implement secret-free fault handling protocol
It contains two parts:
- external: between the VMM thread and the UFFD handler
- internal: between vCPUs and the VMM thread

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 84f9323 commit d464f08

File tree

4 files changed

+276
-10
lines changed

4 files changed

+276
-10
lines changed

src/vmm/src/lib.rs

Lines changed: 185 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ pub mod vstate;
115115
pub mod initrd;
116116

117117
use std::collections::HashMap;
118-
use std::io;
118+
use std::io::{self, Read, Write};
119+
use std::os::fd::RawFd;
119120
use std::os::unix::io::AsRawFd;
120121
use std::os::unix::net::UnixStream;
121122
use std::sync::mpsc::RecvTimeoutError;
@@ -128,6 +129,7 @@ use devices::acpi::vmgenid::VmGenIdError;
128129
use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
129130
use seccomp::BpfProgram;
130131
use userfaultfd::Uffd;
132+
use vm_memory::GuestAddress;
131133
use vmm_sys_util::epoll::EventSet;
132134
use vmm_sys_util::eventfd::EventFd;
133135
use vmm_sys_util::terminal::Terminal;
@@ -147,15 +149,17 @@ use crate::devices::virtio::block::device::Block;
147149
use crate::devices::virtio::net::Net;
148150
use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET};
149151
use crate::logger::{METRICS, MetricsError, error, info, warn};
150-
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
152+
use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
151153
use crate::rate_limiter::BucketUpdate;
152154
use crate::snapshot::Persist;
153155
use crate::vmm_config::instance_info::{InstanceInfo, VmState};
154-
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
156+
use crate::vstate::memory::{
157+
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
158+
};
155159
use crate::vstate::vcpu::VcpuState;
156160
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
157-
use crate::vstate::vm::UserfaultChannel;
158161
pub use crate::vstate::vm::Vm;
162+
use crate::vstate::vm::{UserfaultChannel, UserfaultChannelError, UserfaultData};
159163

160164
/// Shorthand type for the EventManager flavour used by Firecracker.
161165
pub type EventManager = BaseEventManager<Arc<Mutex<dyn MutEventSubscriber>>>;
@@ -803,6 +807,158 @@ impl Vmm {
803807
self.shutdown_exit_code = Some(exit_code);
804808
}
805809

810+
fn active_event_in_userfault_channel(
811+
&self,
812+
source: RawFd,
813+
event_set: EventSet,
814+
) -> Option<usize> {
815+
if let Some(userfault_channels) = &self.userfault_channels {
816+
userfault_channels.iter().position(|channel| {
817+
let receiver = &channel.receiver;
818+
source == receiver.as_raw_fd() && event_set == EventSet::IN
819+
})
820+
} else {
821+
None
822+
}
823+
}
824+
825+
fn process_userfault_channels(&mut self, vcpu: usize) {
826+
loop {
827+
match self
828+
.userfault_channels
829+
.as_mut()
830+
.expect("Userfault channels must be set")[vcpu]
831+
.recv()
832+
{
833+
Ok(userfault_data) => {
834+
let offset = self
835+
.vm
836+
.guest_memory()
837+
.gpa_to_offset(GuestAddress(userfault_data.gpa))
838+
.unwrap();
839+
840+
let fault_request = FaultRequest {
841+
vcpu: vcpu as _,
842+
offset,
843+
flags: userfault_data.flags,
844+
token: None,
845+
};
846+
let fault_request_json = serde_json::to_string(&fault_request).unwrap();
847+
848+
self.uffd_socket
849+
.as_ref()
850+
.unwrap()
851+
.write(fault_request_json.as_bytes())
852+
.unwrap();
853+
}
854+
Err(ref e) => match e {
855+
UserfaultChannelError::IO(io_e) if io_e.kind() == io::ErrorKind::WouldBlock => {
856+
break;
857+
}
858+
_ => panic!("Error receiving userfault data: {}", e),
859+
},
860+
}
861+
}
862+
}
863+
864+
fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
865+
if let Some(uffd_socket) = &self.uffd_socket {
866+
uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
867+
} else {
868+
false
869+
}
870+
}
871+
872+
fn process_uffd_socket(&mut self) {
873+
const BUFFER_SIZE: usize = 4096;
874+
875+
let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
876+
877+
let mut buffer = [0u8; BUFFER_SIZE];
878+
let mut current_pos = 0;
879+
let mut exit_loop = false;
880+
881+
loop {
882+
if current_pos < BUFFER_SIZE {
883+
match stream.read(&mut buffer[current_pos..]) {
884+
Ok(0) => break,
885+
Ok(n) => current_pos += n,
886+
Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
887+
if exit_loop {
888+
break;
889+
}
890+
}
891+
Err(e) => panic!("Read error: {}", e),
892+
}
893+
894+
exit_loop = false;
895+
}
896+
897+
let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
898+
.into_iter::<FaultReply>();
899+
let mut total_consumed = 0;
900+
let mut needs_more = false;
901+
902+
while let Some(result) = parser.next() {
903+
match result {
904+
Ok(fault_reply) => {
905+
let gpa = self
906+
.vm
907+
.common
908+
.guest_memory
909+
.offset_to_gpa(fault_reply.offset)
910+
.expect("Failed to convert offset to GPA");
911+
912+
let userfaultfd_data = UserfaultData {
913+
flags: fault_reply.flags,
914+
gpa: gpa.0,
915+
size: fault_reply.len,
916+
};
917+
918+
let vcpu = fault_reply.vcpu.expect("vCPU must be set");
919+
920+
self.userfault_channels
921+
.as_mut()
922+
.expect("userfault_channels are not set")
923+
.get_mut(vcpu as usize)
924+
.expect("Invalid vcpu index")
925+
.send(userfaultfd_data)
926+
.expect("Failed to send userfault data");
927+
928+
total_consumed = parser.byte_offset();
929+
}
930+
Err(e) if e.is_eof() => {
931+
needs_more = true;
932+
break;
933+
}
934+
Err(e) => {
935+
println!(
936+
"Buffer content: {:?}",
937+
std::str::from_utf8(&buffer[..current_pos])
938+
);
939+
panic!("Invalid JSON: {}", e);
940+
}
941+
}
942+
}
943+
944+
if total_consumed > 0 {
945+
buffer.copy_within(total_consumed..current_pos, 0);
946+
current_pos -= total_consumed;
947+
}
948+
949+
if needs_more {
950+
continue;
951+
}
952+
953+
// We consumed all data in the buffer, but the socket may have remaining unread data so
954+
// we attempt to read from it and exit the loop only if we confirm that nothing is in
955+
// there.
956+
if current_pos == 0 {
957+
exit_loop = true;
958+
}
959+
}
960+
}
961+
806962
/// Gets a reference to kvm-ioctls Vm
807963
#[cfg(feature = "gdb")]
808964
pub fn vm(&self) -> &Vm {
@@ -909,14 +1065,37 @@ impl MutEventSubscriber for Vmm {
9091065
FcExitCode::Ok
9101066
};
9111067
self.stop(exit_code);
912-
} else {
913-
error!("Spurious EventManager event for handler: Vmm");
1068+
}
1069+
1070+
if let Some(vcpu) = self.active_event_in_userfault_channel(source, event_set) {
1071+
self.process_userfault_channels(vcpu);
1072+
}
1073+
1074+
if self.active_event_in_uffd_socket(source, event_set) {
1075+
self.process_uffd_socket();
9141076
}
9151077
}
9161078

9171079
fn init(&mut self, ops: &mut EventOps) {
9181080
if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
9191081
error!("Failed to register vmm exit event: {}", err);
9201082
}
1083+
1084+
if let Some(uffd_socket) = self.uffd_socket.as_ref() {
1085+
if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) {
1086+
panic!("Failed to register UFFD socket: {}", err);
1087+
}
1088+
}
1089+
1090+
if let Some(userfault_channels) = self.userfault_channels.as_ref() {
1091+
for cpu_idx in 0..userfault_channels.len() {
1092+
if let Err(err) = ops.add(Events::new(
1093+
&userfault_channels[cpu_idx].receiver,
1094+
EventSet::IN,
1095+
)) {
1096+
panic!("Failed to register userfault events: {}", err);
1097+
}
1098+
}
1099+
}
9211100
}
9221101
}

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,10 @@ fn send_uffd_handshake(
647647
let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
648648

649649
let socket = UnixStream::connect(mem_uds_path)?;
650+
socket
651+
.set_nonblocking(true)
652+
.expect("Cannot set non-blocking");
653+
650654
socket.send_with_fds(
651655
&[backend_mappings.as_bytes()],
652656
// In the happy case we can close the fd since the other process has it open and is

src/vmm/src/vstate/vcpu.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,15 @@ use crate::logger::{IncMetric, METRICS};
3131
use crate::seccomp::{BpfProgram, BpfProgramRef};
3232
use crate::utils::signal::{Killable, register_signal_handler, sigrtmin};
3333
use crate::utils::sm::StateMachine;
34-
use crate::vstate::vm::{UserfaultChannel, Vm};
34+
use crate::vstate::vm::{UserfaultChannel, UserfaultData, Vm};
3535

3636
/// Signal number (SIGRTMIN) used to kick Vcpus.
3737
pub const VCPU_RTSIG_OFFSET: i32 = 0;
3838

39+
// TODO: remove when KVM userfault support is merged upstream.
40+
/// Flag (bit 4) in the `flags` of a KVM memory-fault exit marking the exit as a userfault.
41+
const KVM_MEMORY_EXIT_FLAG_USERFAULT: u64 = 1 << 4;
42+
3943
/// Errors associated with the wrappers over KVM ioctls.
4044
#[derive(Debug, thiserror::Error, displaydoc::Display)]
4145
pub enum VcpuError {
@@ -312,6 +316,7 @@ impl Vcpu {
312316
// - the other vCPUs won't ever exit out of `KVM_RUN`, but they won't consume CPU.
313317
// So we pause vCPU0 and send a signal to the emulation thread to stop the VMM.
314318
Ok(VcpuEmulation::Stopped) => return self.exit(FcExitCode::Ok),
319+
Ok(VcpuEmulation::Userfault(_)) => unreachable!(),
315320
// If the emulation requests a pause lets do this
316321
#[cfg(feature = "gdb")]
317322
Ok(VcpuEmulation::Paused) => {
@@ -495,6 +500,26 @@ impl Vcpu {
495500
StateMachine::finish()
496501
}
497502

503+
fn handle_userfault(
504+
&mut self,
505+
userfaultfd_data: UserfaultData,
506+
) -> Result<VcpuEmulation, VcpuError> {
507+
let userfault_channel = self
508+
.userfault_channel
509+
.as_mut()
510+
.expect("userfault channel not set");
511+
512+
userfault_channel
513+
.send(userfaultfd_data)
514+
.expect("Failed to send userfault data");
515+
516+
let _ = userfault_channel
517+
.recv()
518+
.expect("Failed to receive userfault response");
519+
520+
Ok(VcpuEmulation::Handled)
521+
}
522+
498523
/// Runs the vCPU in KVM context and handles the kvm exit reason.
499524
///
500525
/// Returns error or enum specifying whether emulation was handled or interrupted.
@@ -505,7 +530,7 @@ impl Vcpu {
505530
return Ok(VcpuEmulation::Interrupted);
506531
}
507532

508-
match self.kvm_vcpu.fd.run() {
533+
let result = match self.kvm_vcpu.fd.run() {
509534
Err(ref err) if err.errno() == libc::EINTR => {
510535
self.kvm_vcpu.fd.set_kvm_immediate_exit(0);
511536
// Notify that this KVM_RUN was interrupted.
@@ -522,7 +547,14 @@ impl Vcpu {
522547
Ok(VcpuEmulation::Paused)
523548
}
524549
emulation_result => handle_kvm_exit(&mut self.kvm_vcpu.peripherals, emulation_result),
525-
}
550+
};
551+
552+
let userfault_data = match result {
553+
Ok(VcpuEmulation::Userfault(userfault_data)) => userfault_data,
554+
_ => return result,
555+
};
556+
557+
self.handle_userfault(userfault_data)
526558
}
527559
}
528560

@@ -600,6 +632,16 @@ fn handle_kvm_exit(
600632
)))
601633
}
602634
},
635+
VcpuExit::MemoryFault { flags, gpa, size } => {
636+
if flags & KVM_MEMORY_EXIT_FLAG_USERFAULT == 0 {
637+
Err(VcpuError::UnhandledKvmExit(format!(
638+
"flags {:x} gpa {:x} size {:x}",
639+
flags, gpa, size
640+
)))
641+
} else {
642+
Ok(VcpuEmulation::Userfault(UserfaultData { flags, gpa, size }))
643+
}
644+
}
603645
arch_specific_reason => {
604646
// run specific architecture emulation.
605647
peripherals.run_arch_emulation(arch_specific_reason)
@@ -761,6 +803,8 @@ pub enum VcpuEmulation {
761803
Interrupted,
762804
/// Stopped.
763805
Stopped,
806+
/// Userfault
807+
Userfault(UserfaultData),
764808
/// Pause request
765809
#[cfg(feature = "gdb")]
766810
Paused,

0 commit comments

Comments
 (0)