Skip to content

Commit 9f98146

Browse files
kalyazin authored and roypat committed
feat(vmm): configure kvm userfault if secret free is enabled
This is needed to instruct the kernel to exit to userspace when a vCPU fault occurs and the corresponding bit in the userfault bitmap is set. The userfault bitmap is allocated in a memfd by Firecracker and sent to the UFFD handler.

This commit also sends 3 fds to the UFFD handler in the handshake:
- UFFD (original)
- guest_memfd: for the handler to be able to populate guest memory
- userfault bitmap memfd: for the handler to be able to disable exits to userspace for the pages that have already been populated

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent c8aab80 commit 9f98146

File tree

5 files changed

+209
-66
lines changed

5 files changed

+209
-66
lines changed

src/vmm/src/builder.rs

Lines changed: 149 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66
use std::fmt::Debug;
7-
use std::io;
8-
use std::os::fd::AsFd;
7+
use std::fs::File;
8+
use std::io::{self};
9+
use std::os::fd::{AsFd, AsRawFd};
910
use std::os::unix::fs::MetadataExt;
1011
#[cfg(feature = "gdb")]
1112
use std::sync::mpsc;
@@ -14,14 +15,13 @@ use std::sync::{Arc, Mutex};
1415
use event_manager::SubscriberOps;
1516
use kvm_ioctls::Cap;
1617
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
17-
use userfaultfd::Uffd;
1818
use utils::time::TimestampUs;
1919
#[cfg(target_arch = "aarch64")]
2020
use vm_memory::GuestAddress;
2121

2222
#[cfg(target_arch = "aarch64")]
2323
use crate::Vcpu;
24-
use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel};
24+
use crate::arch::{ConfigurationError, configure_system_for_boot, host_page_size, load_kernel};
2525
#[cfg(target_arch = "aarch64")]
2626
use crate::construct_kvm_mpidrs;
2727
use crate::cpu_config::templates::{
@@ -30,6 +30,7 @@ use crate::cpu_config::templates::{
3030
#[cfg(target_arch = "x86_64")]
3131
use crate::device_manager;
3232
use crate::device_manager::pci_mngr::PciManagerError;
33+
use crate::device_manager::persist::ACPIDeviceManagerRestoreError;
3334
use crate::device_manager::{
3435
AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError,
3536
DeviceRestoreArgs,
@@ -44,15 +45,19 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
4445
use crate::gdb;
4546
use crate::initrd::{InitrdConfig, InitrdError};
4647
use crate::logger::debug;
47-
use crate::persist::{MicrovmState, MicrovmStateError};
48+
use crate::persist::{
49+
GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError,
50+
guest_memory_from_file, guest_memory_from_uffd,
51+
};
4852
use crate::resources::VmResources;
4953
use crate::seccomp::BpfThreadMap;
5054
use crate::snapshot::Persist;
5155
use crate::utils::u64_to_usize;
5256
use crate::vmm_config::instance_info::InstanceInfo;
5357
use crate::vmm_config::machine_config::MachineConfigError;
58+
use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType};
5459
use crate::vstate::kvm::{Kvm, KvmError};
55-
use crate::vstate::memory::{GuestRegionMmap, MaybeBounce};
60+
use crate::vstate::memory::{MaybeBounce, create_memfd};
5661
#[cfg(target_arch = "aarch64")]
5762
use crate::vstate::resources::ResourceAllocator;
5863
use crate::vstate::vcpu::VcpuError;
@@ -344,6 +349,7 @@ pub fn build_microvm_for_boot(
344349
kvm,
345350
vm,
346351
uffd: None,
352+
uffd_socket: None,
347353
vcpus_handles: Vec::new(),
348354
vcpus_exit_evt,
349355
device_manager,
@@ -416,6 +422,17 @@ pub fn build_and_boot_microvm(
416422
Ok(vmm)
417423
}
418424

425+
/// Sub-Error type for [`build_microvm_from_snapshot`] to contain either
426+
/// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within
427+
/// [`BuildMicrovmFromSnapshotError`].
///
/// Each variant wraps exactly one of the two guest-memory loading errors; the `#[from]`
/// attributes allow converting the underlying error into this type with `?` or `.into()`.
///
/// NOTE(review): because this enum derives `displaydoc::Display`, the doc comments on the
/// variants below are presumably what `Display` prints — treat them as runtime strings and
/// do not reword them casually.
428+
#[derive(Debug, thiserror::Error, displaydoc::Display)]
429+
pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
430+
/// Error creating guest memory from file: {0}
431+
File(#[from] GuestMemoryFromFileError),
432+
/// Error creating guest memory from uffd: {0}
433+
Uffd(#[from] GuestMemoryFromUffdError),
434+
}
435+
419436
/// Error type for [`build_microvm_from_snapshot`].
420437
#[derive(Debug, thiserror::Error, displaydoc::Display)]
421438
pub enum BuildMicrovmFromSnapshotError {
@@ -451,8 +468,55 @@ pub enum BuildMicrovmFromSnapshotError {
451468
SeccompFiltersInternal(#[from] crate::seccomp::InstallationError),
452469
/// Failed to restore devices: {0}
453470
RestoreDevices(#[from] DevicePersistError),
471+
/// Failed to restore ACPI device manager: {0}
472+
ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError),
473+
/// VMGenID update failed: {0}
474+
VMGenIDUpdate(std::io::Error),
475+
/// Internal error while restoring microVM: {0}
476+
Internal(#[from] VmmError),
477+
/// Failed to load guest memory: {0}
478+
GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError),
479+
/// Userfault bitmap memfd error: {0}
480+
UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
454481
}
455482

483+
fn memfd_to_slice(memfd: &mut Option<File>) -> Option<&mut [u8]> {
484+
if let Some(bitmap_file) = memfd {
485+
let len = u64_to_usize(
486+
bitmap_file
487+
.metadata()
488+
.expect("Failed to get metadata")
489+
.len(),
490+
);
491+
492+
// SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
493+
let bitmap_addr = unsafe {
494+
libc::mmap(
495+
std::ptr::null_mut(),
496+
len,
497+
libc::PROT_WRITE,
498+
libc::MAP_SHARED,
499+
bitmap_file.as_raw_fd(),
500+
0,
501+
)
502+
};
503+
504+
if bitmap_addr == libc::MAP_FAILED {
505+
panic!(
506+
"Failed to mmap userfault bitmap file: {}",
507+
std::io::Error::last_os_error()
508+
);
509+
}
510+
511+
// SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
512+
Some(unsafe { std::slice::from_raw_parts_mut(bitmap_addr.cast(), len) })
513+
} else {
514+
None
515+
}
516+
}
517+
// TODO: take it from kvm-bindings when userfault support is merged upstream
518+
const KVM_CAP_USERFAULT: u32 = 245;
519+
456520
/// Builds and starts a microVM based on the provided MicrovmState.
457521
///
458522
/// An `Arc` reference of the built `Vmm` is also plugged in the `EventManager`, while another
@@ -462,25 +526,96 @@ pub fn build_microvm_from_snapshot(
462526
instance_info: &InstanceInfo,
463527
event_manager: &mut EventManager,
464528
microvm_state: MicrovmState,
465-
guest_memory: Vec<GuestRegionMmap>,
466-
uffd: Option<Uffd>,
467529
seccomp_filters: &BpfThreadMap,
530+
params: &LoadSnapshotParams,
468531
vm_resources: &mut VmResources,
469532
) -> Result<Arc<Mutex<Vmm>>, BuildMicrovmFromSnapshotError> {
470533
// Build Vmm.
471534
debug!("event_start: build microvm from snapshot");
472535

473-
let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone())
474-
.map_err(StartMicrovmError::Kvm)?;
536+
let secret_free = vm_resources.machine_config.secret_free;
537+
let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone();
538+
if secret_free {
539+
kvm_capabilities.push(KvmCapability::Add(Cap::GuestMemfd as u32));
540+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_MMAP));
541+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_NO_DIRECT_MAP));
542+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT));
543+
}
544+
545+
let kvm = Kvm::new(kvm_capabilities).map_err(StartMicrovmError::Kvm)?;
475546
// Set up Kvm Vm and register memory regions.
476547
// Build custom CPU config if a custom template is provided.
477-
let mut vm = Vm::new(&kvm, false).map_err(StartMicrovmError::Vm)?;
548+
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
478549

479550
let (mut vcpus, vcpus_exit_evt) = vm
480551
.create_vcpus(vm_resources.machine_config.vcpu_count)
481552
.map_err(StartMicrovmError::Vm)?;
482553

483-
vm.register_memory_regions(guest_memory, None)
554+
let guest_memfd = match secret_free {
555+
true => Some(
556+
vm.create_guest_memfd(
557+
vm_resources.memory_size(),
558+
GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
559+
)
560+
.map_err(VmmError::Vm)?,
561+
),
562+
false => None,
563+
};
564+
565+
let mut userfault_bitmap_memfd = if secret_free {
566+
let bitmap_size = vm_resources.memory_size() / host_page_size() / u8::BITS as usize;
567+
let bitmap_file = create_memfd(bitmap_size as u64, None)?;
568+
569+
Some(bitmap_file.into_file())
570+
} else {
571+
None
572+
};
573+
574+
let mem_backend_path = &params.mem_backend.backend_path;
575+
let mem_state = &microvm_state.vm_state.memory;
576+
let track_dirty_pages = params.track_dirty_pages;
577+
578+
let (guest_memory, uffd, uffd_socket) = match params.mem_backend.backend_type {
579+
MemBackendType::File => {
580+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() {
581+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File(
582+
GuestMemoryFromFileError::HugetlbfsSnapshot,
583+
)
584+
.into());
585+
}
586+
(
587+
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
588+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?,
589+
None,
590+
None,
591+
)
592+
}
593+
MemBackendType::Uffd => {
594+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() && guest_memfd.is_some() {
595+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd(
596+
GuestMemoryFromUffdError::HugetlbfsSnapshot,
597+
)
598+
.into());
599+
}
600+
guest_memory_from_uffd(
601+
mem_backend_path,
602+
mem_state,
603+
track_dirty_pages,
604+
vm_resources.machine_config.huge_pages,
605+
guest_memfd,
606+
userfault_bitmap_memfd.as_ref(),
607+
)
608+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?
609+
}
610+
};
611+
612+
let mut userfault_bitmap = memfd_to_slice(&mut userfault_bitmap_memfd);
613+
if let Some(ref mut slice) = userfault_bitmap {
614+
// Set all bits so a fault on any page will cause a VM exit
615+
slice.fill(0xffu8);
616+
}
617+
618+
vm.register_memory_regions(guest_memory, userfault_bitmap)
484619
.map_err(StartMicrovmError::Vm)?;
485620

486621
#[cfg(target_arch = "x86_64")]
@@ -544,6 +679,7 @@ pub fn build_microvm_from_snapshot(
544679
kvm,
545680
vm,
546681
uffd,
682+
uffd_socket,
547683
vcpus_handles: Vec::new(),
548684
vcpus_exit_evt,
549685
device_manager,
@@ -811,6 +947,7 @@ pub(crate) mod tests {
811947
kvm,
812948
vm: Arc::new(vm),
813949
uffd: None,
950+
uffd_socket: None,
814951
vcpus_handles: Vec::new(),
815952
vcpus_exit_evt,
816953
device_manager: default_device_manager(),

src/vmm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ pub mod initrd;
117117
use std::collections::HashMap;
118118
use std::io;
119119
use std::os::unix::io::AsRawFd;
120+
use std::os::unix::net::UnixStream;
120121
use std::sync::mpsc::RecvTimeoutError;
121122
use std::sync::{Arc, Barrier, Mutex};
122123
use std::time::Duration;
@@ -297,6 +298,8 @@ pub struct Vmm {
297298
// Save UFFD in order to keep it open in the Firecracker process, as well.
298299
#[allow(unused)]
299300
uffd: Option<Uffd>,
301+
// Used for userfault communication with the UFFD handler when secret freedom is enabled
302+
uffd_socket: Option<UnixStream>,
300303
vcpus_handles: Vec<VcpuHandle>,
301304
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
302305
vcpus_exit_evt: EventFd,

0 commit comments

Comments
 (0)