Skip to content

Commit 18edbd0

Browse files
kalyazin authored and roypat committed
feat(vmm): configure kvm userfault if secret free is enabled
This is needed to instruct the kernel to exit to userspace when a vCPU fault occurs and the corresponding bit in the userfault bitmap is set.

The userfault bitmap is allocated in a memfd by Firecracker and sent to the UFFD handler.

This also sends 3 fds to the UFFD handler in the handshake:
- UFFD (original)
- guest_memfd: for the handler to be able to populate guest memory
- userfault bitmap memfd: for the handler to be able to disable exits to userspace for the pages that have already been populated

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 15d68d6 commit 18edbd0

File tree

3 files changed

+198
-62
lines changed

3 files changed

+198
-62
lines changed

src/vmm/src/builder.rs

Lines changed: 142 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66
use std::fmt::Debug;
7-
use std::io;
8-
use std::os::fd::AsFd;
7+
use std::fs::File;
8+
use std::io::{self};
9+
use std::os::fd::{AsFd, AsRawFd};
910
use std::os::unix::fs::MetadataExt;
1011
#[cfg(feature = "gdb")]
1112
use std::sync::mpsc;
@@ -14,7 +15,6 @@ use std::sync::{Arc, Mutex};
1415
use event_manager::{MutEventSubscriber, SubscriberOps};
1516
use libc::EFD_NONBLOCK;
1617
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
17-
use userfaultfd::Uffd;
1818
use utils::time::TimestampUs;
1919
#[cfg(target_arch = "aarch64")]
2020
use vm_memory::GuestAddress;
@@ -23,7 +23,7 @@ use vm_superio::Rtc;
2323
use vm_superio::Serial;
2424
use vmm_sys_util::eventfd::EventFd;
2525

26-
use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel};
26+
use crate::arch::{ConfigurationError, configure_system_for_boot, host_page_size, load_kernel};
2727
#[cfg(target_arch = "aarch64")]
2828
use crate::construct_kvm_mpidrs;
2929
use crate::cpu_config::templates::{
@@ -54,15 +54,19 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
5454
use crate::gdb;
5555
use crate::initrd::{InitrdConfig, InitrdError};
5656
use crate::logger::{debug, error};
57-
use crate::persist::{MicrovmState, MicrovmStateError};
57+
use crate::persist::{
58+
GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError,
59+
guest_memory_from_file, guest_memory_from_uffd,
60+
};
5861
use crate::resources::VmResources;
5962
use crate::seccomp::BpfThreadMap;
6063
use crate::snapshot::Persist;
6164
use crate::utils::u64_to_usize;
6265
use crate::vmm_config::instance_info::InstanceInfo;
6366
use crate::vmm_config::machine_config::MachineConfigError;
67+
use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType};
6468
use crate::vstate::kvm::Kvm;
65-
use crate::vstate::memory::{GuestRegionMmap, MaybeBounce};
69+
use crate::vstate::memory::{MaybeBounce, create_memfd};
6670
use crate::vstate::vcpu::{Vcpu, VcpuError};
6771
use crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm};
6872
use crate::{EventManager, Vmm, VmmError, device_manager};
@@ -188,6 +192,7 @@ fn create_vmm_and_vcpus(
188192
kvm,
189193
vm,
190194
uffd: None,
195+
uffd_socket: None,
191196
vcpus_handles: Vec::new(),
192197
vcpus_exit_evt,
193198
resource_allocator,
@@ -422,6 +427,17 @@ pub fn build_and_boot_microvm(
422427
Ok(vmm)
423428
}
424429

430+
/// Guest-memory sub-error of [`BuildMicrovmFromSnapshotError`]: wraps either a
431+
/// [`GuestMemoryFromFileError`] or a [`GuestMemoryFromUffdError`], depending on which memory
432+
/// backend [`build_microvm_from_snapshot`] was restoring guest memory from.
433+
#[derive(Debug, thiserror::Error, displaydoc::Display)]
434+
pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
435+
/// Error creating guest memory from file: {0}
436+
File(#[from] GuestMemoryFromFileError),
437+
/// Error creating guest memory from uffd: {0}
438+
Uffd(#[from] GuestMemoryFromUffdError),
439+
}
440+
425441
/// Error type for [`build_microvm_from_snapshot`].
426442
#[derive(Debug, thiserror::Error, displaydoc::Display)]
427443
pub enum BuildMicrovmFromSnapshotError {
@@ -459,6 +475,47 @@ pub enum BuildMicrovmFromSnapshotError {
459475
ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError),
460476
/// VMGenID update failed: {0}
461477
VMGenIDUpdate(std::io::Error),
478+
/// Internal error while restoring microVM: {0}
479+
Internal(#[from] VmmError),
480+
/// Failed to load guest memory: {0}
481+
GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError),
482+
/// Userfault bitmap memfd error: {0}
483+
UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
484+
}
485+
486+
fn memfd_to_slice(memfd: &Option<File>) -> Option<&mut [u8]> {
487+
if let Some(bitmap_file) = memfd {
488+
let len = u64_to_usize(
489+
bitmap_file
490+
.metadata()
491+
.expect("Failed to get metadata")
492+
.len(),
493+
);
494+
495+
// SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
496+
let bitmap_addr = unsafe {
497+
libc::mmap(
498+
std::ptr::null_mut(),
499+
len,
500+
libc::PROT_WRITE,
501+
libc::MAP_SHARED,
502+
bitmap_file.as_raw_fd(),
503+
0,
504+
)
505+
};
506+
507+
if bitmap_addr == libc::MAP_FAILED {
508+
panic!(
509+
"Failed to mmap userfault bitmap file: {}",
510+
std::io::Error::last_os_error()
511+
);
512+
}
513+
514+
// SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
515+
Some(unsafe { std::slice::from_raw_parts_mut(bitmap_addr.cast(), len) })
516+
} else {
517+
None
518+
}
462519
}
463520

464521
/// Builds and starts a microVM based on the provided MicrovmState.
@@ -470,27 +527,100 @@ pub fn build_microvm_from_snapshot(
470527
instance_info: &InstanceInfo,
471528
event_manager: &mut EventManager,
472529
microvm_state: MicrovmState,
473-
guest_memory: Vec<GuestRegionMmap>,
474-
uffd: Option<Uffd>,
475530
seccomp_filters: &BpfThreadMap,
531+
params: &LoadSnapshotParams,
476532
vm_resources: &mut VmResources,
477533
) -> Result<Arc<Mutex<Vmm>>, BuildMicrovmFromSnapshotError> {
534+
// TODO: take it from kvm-bindings when userfault support is merged upstream
535+
const KVM_CAP_USERFAULT: u32 = 241;
536+
478537
// Build Vmm.
479538
debug!("event_start: build microvm from snapshot");
539+
540+
let secret_free = vm_resources.machine_config.secret_free;
541+
542+
let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone();
543+
if secret_free {
544+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT));
545+
}
546+
480547
let (mut vmm, mut vcpus) = create_vmm_and_vcpus(
481548
instance_info,
482549
event_manager,
483550
vm_resources.machine_config.vcpu_count,
484-
microvm_state.kvm_state.kvm_cap_modifiers.clone(),
485-
false,
551+
kvm_capabilities,
552+
secret_free,
486553
)
487554
.map_err(StartMicrovmError::Internal)?;
488555

556+
let guest_memfd = match secret_free {
557+
true => Some(
558+
vmm.vm
559+
.create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP)
560+
.map_err(VmmError::Vm)?,
561+
),
562+
false => None,
563+
};
564+
565+
let userfault_bitmap_memfd = if secret_free {
566+
let bitmap_size = vm_resources.memory_size() / host_page_size() / u8::BITS as usize;
567+
let bitmap_file = create_memfd(bitmap_size as u64, None)?;
568+
569+
Some(bitmap_file.into_file())
570+
} else {
571+
None
572+
};
573+
574+
let mem_backend_path = &params.mem_backend.backend_path;
575+
let mem_state = &microvm_state.vm_state.memory;
576+
let track_dirty_pages = params.enable_diff_snapshots;
577+
578+
let (guest_memory, uffd, socket) = match params.mem_backend.backend_type {
579+
MemBackendType::File => {
580+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() {
581+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File(
582+
GuestMemoryFromFileError::HugetlbfsSnapshot,
583+
)
584+
.into());
585+
}
586+
(
587+
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
588+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?,
589+
None,
590+
None,
591+
)
592+
}
593+
MemBackendType::Uffd => {
594+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() && guest_memfd.is_some() {
595+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd(
596+
GuestMemoryFromUffdError::HugetlbfsSnapshot,
597+
)
598+
.into());
599+
}
600+
guest_memory_from_uffd(
601+
mem_backend_path,
602+
mem_state,
603+
track_dirty_pages,
604+
vm_resources.machine_config.huge_pages,
605+
guest_memfd,
606+
userfault_bitmap_memfd.as_ref(),
607+
)
608+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?
609+
}
610+
};
611+
612+
let mut userfault_bitmap = memfd_to_slice(&userfault_bitmap_memfd);
613+
if let Some(ref mut slice) = userfault_bitmap {
614+
// Set all bits so a fault on any page will cause a VM exit
615+
slice.fill(0xffu8);
616+
}
617+
489618
vmm.vm
490-
.register_memory_regions(guest_memory, None)
619+
.register_memory_regions(guest_memory, userfault_bitmap)
491620
.map_err(VmmError::Vm)
492621
.map_err(StartMicrovmError::Internal)?;
493622
vmm.uffd = uffd;
623+
vmm.uffd_socket = socket;
494624

495625
#[cfg(target_arch = "x86_64")]
496626
{
@@ -956,6 +1086,7 @@ pub(crate) mod tests {
9561086
kvm,
9571087
vm,
9581088
uffd: None,
1089+
uffd_socket: None,
9591090
vcpus_handles: Vec::new(),
9601091
vcpus_exit_evt,
9611092
resource_allocator: ResourceAllocator::new().unwrap(),

src/vmm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ pub mod initrd;
117117
use std::collections::HashMap;
118118
use std::io;
119119
use std::os::unix::io::AsRawFd;
120+
use std::os::unix::net::UnixStream;
120121
use std::sync::mpsc::RecvTimeoutError;
121122
use std::sync::{Arc, Barrier, Mutex};
122123
use std::time::Duration;
@@ -310,6 +311,8 @@ pub struct Vmm {
310311
pub vm: Vm,
311312
// Save UFFD in order to keep it open in the Firecracker process, as well.
312313
uffd: Option<Uffd>,
314+
// Used for userfault communication with the UFFD handler when secret freedom is enabled
315+
uffd_socket: Option<UnixStream>,
313316
vcpus_handles: Vec<VcpuHandle>,
314317
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
315318
vcpus_exit_evt: EventFd,

0 commit comments

Comments
 (0)