Skip to content

Commit 01c02c5

Browse files
kalyazin authored and roypat committed
feat(vmm): configure kvm userfault if secret free is enabled
This is needed to instruct the kernel to exit to userspace when a vCPU fault occurs and the corresponding bit in the userfault bitmap is set. The userfault bitmap is allocated in a memfd by Firecracker and sent to the UFFD handler.

This also sends 3 fds to the UFFD handler in the handshake:
- UFFD (original)
- guest_memfd: for the handler to be able to populate guest memory
- userfault bitmap memfd: for the handler to be able to disable exits to userspace for the pages that have already been populated

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 10fe7f0 commit 01c02c5

File tree

5 files changed

+209
-66
lines changed

5 files changed

+209
-66
lines changed

src/vmm/src/builder.rs

Lines changed: 149 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66
use std::fmt::Debug;
7-
use std::io;
8-
use std::os::fd::AsFd;
7+
use std::fs::File;
8+
use std::io::{self};
9+
use std::os::fd::{AsFd, AsRawFd};
910
use std::os::unix::fs::MetadataExt;
1011
#[cfg(feature = "gdb")]
1112
use std::sync::mpsc;
@@ -14,14 +15,13 @@ use std::sync::{Arc, Mutex};
1415
use event_manager::SubscriberOps;
1516
use kvm_ioctls::Cap;
1617
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
17-
use userfaultfd::Uffd;
1818
use utils::time::TimestampUs;
1919
#[cfg(target_arch = "aarch64")]
2020
use vm_memory::GuestAddress;
2121

2222
#[cfg(target_arch = "aarch64")]
2323
use crate::Vcpu;
24-
use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel};
24+
use crate::arch::{ConfigurationError, configure_system_for_boot, host_page_size, load_kernel};
2525
#[cfg(target_arch = "aarch64")]
2626
use crate::construct_kvm_mpidrs;
2727
use crate::cpu_config::templates::{
@@ -30,6 +30,7 @@ use crate::cpu_config::templates::{
3030
#[cfg(target_arch = "x86_64")]
3131
use crate::device_manager;
3232
use crate::device_manager::pci_mngr::PciManagerError;
33+
use crate::device_manager::persist::ACPIDeviceManagerRestoreError;
3334
use crate::device_manager::{
3435
AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError,
3536
DeviceRestoreArgs,
@@ -44,15 +45,19 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
4445
use crate::gdb;
4546
use crate::initrd::{InitrdConfig, InitrdError};
4647
use crate::logger::debug;
47-
use crate::persist::{MicrovmState, MicrovmStateError};
48+
use crate::persist::{
49+
GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError,
50+
guest_memory_from_file, guest_memory_from_uffd,
51+
};
4852
use crate::resources::VmResources;
4953
use crate::seccomp::BpfThreadMap;
5054
use crate::snapshot::Persist;
5155
use crate::utils::u64_to_usize;
5256
use crate::vmm_config::instance_info::InstanceInfo;
5357
use crate::vmm_config::machine_config::MachineConfigError;
58+
use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType};
5459
use crate::vstate::kvm::{Kvm, KvmError};
55-
use crate::vstate::memory::{GuestRegionMmap, MaybeBounce};
60+
use crate::vstate::memory::{MaybeBounce, create_memfd};
5661
#[cfg(target_arch = "aarch64")]
5762
use crate::vstate::resources::ResourceAllocator;
5863
use crate::vstate::vcpu::VcpuError;
@@ -335,6 +340,7 @@ pub fn build_microvm_for_boot(
335340
kvm,
336341
vm,
337342
uffd: None,
343+
uffd_socket: None,
338344
vcpus_handles: Vec::new(),
339345
vcpus_exit_evt,
340346
device_manager,
@@ -407,6 +413,17 @@ pub fn build_and_boot_microvm(
407413
Ok(vmm)
408414
}
409415

416+
/// Sub-Error type for [`build_microvm_from_snapshot`] to contain either
417+
/// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within
418+
/// [`BuildMicrovmFromSnapshotError`].
///
/// Each variant wraps its source error via `#[from]`, so either source error converts into
/// this type through `From`/`?`.
// NOTE: `displaydoc::Display` derives the `Display` text from the `///` line on each variant,
// so the variant doc comments below are user-visible error messages, not just documentation.
419+
#[derive(Debug, thiserror::Error, displaydoc::Display)]
420+
pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
421+
/// Error creating guest memory from file: {0}
422+
File(#[from] GuestMemoryFromFileError),
423+
/// Error creating guest memory from uffd: {0}
424+
Uffd(#[from] GuestMemoryFromUffdError),
425+
}
426+
410427
/// Error type for [`build_microvm_from_snapshot`].
411428
#[derive(Debug, thiserror::Error, displaydoc::Display)]
412429
pub enum BuildMicrovmFromSnapshotError {
@@ -442,8 +459,55 @@ pub enum BuildMicrovmFromSnapshotError {
442459
SeccompFiltersInternal(#[from] crate::seccomp::InstallationError),
443460
/// Failed to restore devices: {0}
444461
RestoreDevices(#[from] DevicePersistError),
462+
/// Failed to restore ACPI device manager: {0}
463+
ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError),
464+
/// VMGenID update failed: {0}
465+
VMGenIDUpdate(std::io::Error),
466+
/// Internal error while restoring microVM: {0}
467+
Internal(#[from] VmmError),
468+
/// Failed to load guest memory: {0}
469+
GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError),
470+
/// Userfault bitmap memfd error: {0}
471+
UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
445472
}
446473

474+
fn memfd_to_slice(memfd: &Option<File>) -> Option<&mut [u8]> {
475+
if let Some(bitmap_file) = memfd {
476+
let len = u64_to_usize(
477+
bitmap_file
478+
.metadata()
479+
.expect("Failed to get metadata")
480+
.len(),
481+
);
482+
483+
// SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
484+
let bitmap_addr = unsafe {
485+
libc::mmap(
486+
std::ptr::null_mut(),
487+
len,
488+
libc::PROT_WRITE,
489+
libc::MAP_SHARED,
490+
bitmap_file.as_raw_fd(),
491+
0,
492+
)
493+
};
494+
495+
if bitmap_addr == libc::MAP_FAILED {
496+
panic!(
497+
"Failed to mmap userfault bitmap file: {}",
498+
std::io::Error::last_os_error()
499+
);
500+
}
501+
502+
// SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
503+
Some(unsafe { std::slice::from_raw_parts_mut(bitmap_addr.cast(), len) })
504+
} else {
505+
None
506+
}
507+
}
508+
// TODO: take it from kvm-bindings when userfault support is merged upstream
509+
// Raw KVM capability number, defined locally until kvm-bindings provides it (see TODO above).
// `build_microvm_from_snapshot` adds it to the requested capabilities when `secret_free` is set.
const KVM_CAP_USERFAULT: u32 = 245;
510+
447511
/// Builds and starts a microVM based on the provided MicrovmState.
448512
///
449513
/// An `Arc` reference of the built `Vmm` is also plugged in the `EventManager`, while another
@@ -453,25 +517,96 @@ pub fn build_microvm_from_snapshot(
453517
instance_info: &InstanceInfo,
454518
event_manager: &mut EventManager,
455519
microvm_state: MicrovmState,
456-
guest_memory: Vec<GuestRegionMmap>,
457-
uffd: Option<Uffd>,
458520
seccomp_filters: &BpfThreadMap,
521+
params: &LoadSnapshotParams,
459522
vm_resources: &mut VmResources,
460523
) -> Result<Arc<Mutex<Vmm>>, BuildMicrovmFromSnapshotError> {
461524
// Build Vmm.
462525
debug!("event_start: build microvm from snapshot");
463526

464-
let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone())
465-
.map_err(StartMicrovmError::Kvm)?;
527+
let secret_free = vm_resources.machine_config.secret_free;
528+
let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone();
529+
if secret_free {
530+
kvm_capabilities.push(KvmCapability::Add(Cap::GuestMemfd as u32));
531+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_MMAP));
532+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_NO_DIRECT_MAP));
533+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT));
534+
}
535+
536+
let kvm = Kvm::new(kvm_capabilities).map_err(StartMicrovmError::Kvm)?;
466537
// Set up Kvm Vm and register memory regions.
467538
// Build custom CPU config if a custom template is provided.
468-
let mut vm = Vm::new(&kvm, false).map_err(StartMicrovmError::Vm)?;
539+
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
469540

470541
let (mut vcpus, vcpus_exit_evt) = vm
471542
.create_vcpus(vm_resources.machine_config.vcpu_count)
472543
.map_err(StartMicrovmError::Vm)?;
473544

474-
vm.register_memory_regions(guest_memory, None)
545+
let guest_memfd = match secret_free {
546+
true => Some(
547+
vm.create_guest_memfd(
548+
vm_resources.memory_size(),
549+
GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
550+
)
551+
.map_err(VmmError::Vm)?,
552+
),
553+
false => None,
554+
};
555+
556+
let userfault_bitmap_memfd = if secret_free {
557+
let bitmap_size = vm_resources.memory_size() / host_page_size() / u8::BITS as usize;
558+
let bitmap_file = create_memfd(bitmap_size as u64, None)?;
559+
560+
Some(bitmap_file.into_file())
561+
} else {
562+
None
563+
};
564+
565+
let mem_backend_path = &params.mem_backend.backend_path;
566+
let mem_state = &microvm_state.vm_state.memory;
567+
let track_dirty_pages = params.track_dirty_pages;
568+
569+
let (guest_memory, uffd, uffd_socket) = match params.mem_backend.backend_type {
570+
MemBackendType::File => {
571+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() {
572+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File(
573+
GuestMemoryFromFileError::HugetlbfsSnapshot,
574+
)
575+
.into());
576+
}
577+
(
578+
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
579+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?,
580+
None,
581+
None,
582+
)
583+
}
584+
MemBackendType::Uffd => {
585+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() && guest_memfd.is_some() {
586+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd(
587+
GuestMemoryFromUffdError::HugetlbfsSnapshot,
588+
)
589+
.into());
590+
}
591+
guest_memory_from_uffd(
592+
mem_backend_path,
593+
mem_state,
594+
track_dirty_pages,
595+
vm_resources.machine_config.huge_pages,
596+
guest_memfd,
597+
userfault_bitmap_memfd.as_ref(),
598+
)
599+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?
600+
}
601+
};
602+
603+
let mut userfault_bitmap = memfd_to_slice(&userfault_bitmap_memfd);
604+
if let Some(ref mut slice) = userfault_bitmap {
605+
// Set all bits so a fault on any page will cause a VM exit
606+
slice.fill(0xffu8);
607+
}
608+
609+
vm.register_memory_regions(guest_memory, userfault_bitmap)
475610
.map_err(StartMicrovmError::Vm)?;
476611

477612
#[cfg(target_arch = "x86_64")]
@@ -536,6 +671,7 @@ pub fn build_microvm_from_snapshot(
536671
kvm,
537672
vm,
538673
uffd,
674+
uffd_socket,
539675
vcpus_handles: Vec::new(),
540676
vcpus_exit_evt,
541677
device_manager,
@@ -804,6 +940,7 @@ pub(crate) mod tests {
804940
kvm,
805941
vm: Arc::new(vm),
806942
uffd: None,
943+
uffd_socket: None,
807944
vcpus_handles: Vec::new(),
808945
vcpus_exit_evt,
809946
device_manager: default_device_manager(),

src/vmm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ pub mod initrd;
117117
use std::collections::HashMap;
118118
use std::io;
119119
use std::os::unix::io::AsRawFd;
120+
use std::os::unix::net::UnixStream;
120121
use std::sync::mpsc::RecvTimeoutError;
121122
use std::sync::{Arc, Barrier, Mutex};
122123
use std::time::Duration;
@@ -305,6 +306,8 @@ pub struct Vmm {
305306
// Save UFFD in order to keep it open in the Firecracker process, as well.
306307
#[allow(unused)]
307308
uffd: Option<Uffd>,
309+
// Used for userfault communication with the UFFD handler when secret freedom is enabled
310+
uffd_socket: Option<UnixStream>,
308311
vcpus_handles: Vec<VcpuHandle>,
309312
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
310313
vcpus_exit_evt: EventFd,

0 commit comments

Comments
 (0)