Skip to content

Commit f405192

Browse files
committed
feat(vmm): configure kvm userfault if secret free is enabled
This is needed to instruct the kernel to exit to userspace when a vCPU fault occurs and the corresponding bit in the userfault bitmap is set. This also sends 3 fds to the UFFD handler in the handshake: - UFFD (original) - guest_memfd: for the handler to be able to populate guest memory - userfault bitmap memfd: for the handler to be able to disable exits to userspace for the pages that have already been populated Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 9fdd5d2 commit f405192

File tree

3 files changed

+204
-62
lines changed

3 files changed

+204
-62
lines changed

src/vmm/src/builder.rs

Lines changed: 98 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66
use std::fmt::Debug;
7-
use std::io;
7+
use std::io::{self, Write};
88
use std::os::fd::AsFd;
99
use std::os::unix::fs::MetadataExt;
1010
#[cfg(feature = "gdb")]
@@ -14,7 +14,6 @@ use std::sync::{Arc, Mutex};
1414
use event_manager::{MutEventSubscriber, SubscriberOps};
1515
use libc::EFD_NONBLOCK;
1616
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
17-
use userfaultfd::Uffd;
1817
use utils::time::TimestampUs;
1918
#[cfg(target_arch = "aarch64")]
2019
use vm_memory::GuestAddress;
@@ -23,7 +22,7 @@ use vm_superio::Rtc;
2322
use vm_superio::Serial;
2423
use vmm_sys_util::eventfd::EventFd;
2524

26-
use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel};
25+
use crate::arch::{ConfigurationError, configure_system_for_boot, host_page_size, load_kernel};
2726
#[cfg(target_arch = "aarch64")]
2827
use crate::construct_kvm_mpidrs;
2928
use crate::cpu_config::templates::{
@@ -54,15 +53,19 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
5453
use crate::gdb;
5554
use crate::initrd::{InitrdConfig, InitrdError};
5655
use crate::logger::{debug, error};
57-
use crate::persist::{MicrovmState, MicrovmStateError};
56+
use crate::persist::{
57+
GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError,
58+
guest_memory_from_file, guest_memory_from_uffd,
59+
};
5860
use crate::resources::VmResources;
5961
use crate::seccomp::BpfThreadMap;
6062
use crate::snapshot::Persist;
6163
use crate::utils::u64_to_usize;
6264
use crate::vmm_config::instance_info::InstanceInfo;
6365
use crate::vmm_config::machine_config::MachineConfigError;
66+
use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType};
6467
use crate::vstate::kvm::Kvm;
65-
use crate::vstate::memory::{GuestRegionMmap, MaybeBounce};
68+
use crate::vstate::memory::{MaybeBounce, create_memfd};
6669
use crate::vstate::vcpu::{Vcpu, VcpuError};
6770
use crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm};
6871
use crate::{EventManager, Vmm, VmmError, device_manager};
@@ -188,6 +191,7 @@ fn create_vmm_and_vcpus(
188191
kvm,
189192
vm,
190193
uffd: None,
194+
uffd_socket: None,
191195
vcpus_handles: Vec::new(),
192196
vcpus_exit_evt,
193197
resource_allocator,
@@ -422,6 +426,17 @@ pub fn build_and_boot_microvm(
422426
Ok(vmm)
423427
}
424428

429+
/// Sub-Error type for [`build_microvm_from_snapshot`] to contain either
430+
/// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within
431+
/// [`BuildMicrovmFromSnapshotError`].
432+
#[derive(Debug, thiserror::Error, displaydoc::Display)]
433+
pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
434+
/// Error creating guest memory from file: {0}
435+
File(#[from] GuestMemoryFromFileError),
436+
/// Error creating guest memory from uffd: {0}
437+
Uffd(#[from] GuestMemoryFromUffdError),
438+
}
439+
425440
/// Error type for [`build_microvm_from_snapshot`].
426441
#[derive(Debug, thiserror::Error, displaydoc::Display)]
427442
pub enum BuildMicrovmFromSnapshotError {
@@ -459,6 +474,12 @@ pub enum BuildMicrovmFromSnapshotError {
459474
ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError),
460475
/// VMGenID update failed: {0}
461476
VMGenIDUpdate(std::io::Error),
477+
/// Internal error while restoring microVM: {0}
478+
Internal(#[from] VmmError),
479+
/// Failed to load guest memory: {0}
480+
GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError),
481+
/// Userfault bitmap memfd error: {0}
482+
UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
462483
}
463484

464485
/// Builds and starts a microVM based on the provided MicrovmState.
@@ -470,27 +491,93 @@ pub fn build_microvm_from_snapshot(
470491
instance_info: &InstanceInfo,
471492
event_manager: &mut EventManager,
472493
microvm_state: MicrovmState,
473-
guest_memory: Vec<GuestRegionMmap>,
474-
uffd: Option<Uffd>,
475494
seccomp_filters: &BpfThreadMap,
495+
params: &LoadSnapshotParams,
476496
vm_resources: &mut VmResources,
477497
) -> Result<Arc<Mutex<Vmm>>, BuildMicrovmFromSnapshotError> {
498+
// TODO: take it from kvm-bindings when userfault support is merged upstream
499+
const KVM_CAP_USERFAULT: u32 = 241;
500+
478501
// Build Vmm.
479502
debug!("event_start: build microvm from snapshot");
503+
504+
let secret_free = vm_resources.machine_config.secret_free;
505+
506+
let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone();
507+
if secret_free {
508+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT));
509+
}
510+
480511
let (mut vmm, mut vcpus) = create_vmm_and_vcpus(
481512
instance_info,
482513
event_manager,
483514
vm_resources.machine_config.vcpu_count,
484-
microvm_state.kvm_state.kvm_cap_modifiers.clone(),
485-
false,
515+
kvm_capabilities,
516+
secret_free,
486517
)
487518
.map_err(StartMicrovmError::Internal)?;
488519

520+
let guest_memfd = match secret_free {
521+
true => Some(
522+
vmm.vm
523+
.create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP)
524+
.map_err(VmmError::Vm)?,
525+
),
526+
false => None,
527+
};
528+
529+
let userfault_bitmap_memfd = if secret_free {
530+
let bitmap_size = vm_resources.memory_size() / host_page_size();
531+
let bitmap_file = create_memfd(bitmap_size as u64, None)?;
532+
533+
// Set all bits so a fault on any page will cause a VM exit
534+
let all_set = vec![0xffu8; bitmap_size];
535+
bitmap_file.as_file().write_all(&all_set).unwrap();
536+
537+
Some(bitmap_file.into_file())
538+
} else {
539+
None
540+
};
541+
542+
let mem_backend_path = &params.mem_backend.backend_path;
543+
let mem_state = &microvm_state.vm_state.memory;
544+
let track_dirty_pages = params.enable_diff_snapshots;
545+
546+
let (guest_memory, uffd, socket) = match params.mem_backend.backend_type {
547+
MemBackendType::File => {
548+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() {
549+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File(
550+
GuestMemoryFromFileError::HugetlbfsSnapshot,
551+
)
552+
.into());
553+
}
554+
(
555+
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
556+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?,
557+
None,
558+
None,
559+
)
560+
}
561+
MemBackendType::Uffd => guest_memory_from_uffd(
562+
mem_backend_path,
563+
mem_state,
564+
track_dirty_pages,
565+
vm_resources.machine_config.huge_pages,
566+
guest_memfd,
567+
userfault_bitmap_memfd.as_ref(),
568+
)
569+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?,
570+
};
571+
489572
vmm.vm
490-
.register_memory_regions(guest_memory, None)
573+
.register_memory_regions(guest_memory, userfault_bitmap_memfd.as_ref())
491574
.map_err(VmmError::Vm)
492575
.map_err(StartMicrovmError::Internal)?;
493576
vmm.uffd = uffd;
577+
vmm.uffd_socket = socket;
578+
579+
#[cfg(target_arch = "x86_64")]
580+
vmm.vm.set_memory_private().map_err(VmmError::Vm)?;
494581

495582
#[cfg(target_arch = "x86_64")]
496583
{
@@ -956,6 +1043,7 @@ pub(crate) mod tests {
9561043
kvm,
9571044
vm,
9581045
uffd: None,
1046+
uffd_socket: None,
9591047
vcpus_handles: Vec::new(),
9601048
vcpus_exit_evt,
9611049
resource_allocator: ResourceAllocator::new().unwrap(),

src/vmm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ pub mod initrd;
117117
use std::collections::HashMap;
118118
use std::io;
119119
use std::os::unix::io::AsRawFd;
120+
use std::os::unix::net::UnixStream;
120121
use std::sync::mpsc::RecvTimeoutError;
121122
use std::sync::{Arc, Barrier, Mutex};
122123
use std::time::Duration;
@@ -310,6 +311,8 @@ pub struct Vmm {
310311
pub vm: Vm,
311312
// Save UFFD in order to keep it open in the Firecracker process, as well.
312313
uffd: Option<Uffd>,
314+
// Used for userfault communication with the UFFD handler when secret freedom is enabled
315+
uffd_socket: Option<UnixStream>,
313316
vcpus_handles: Vec<VcpuHandle>,
314317
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
315318
vcpus_exit_evt: EventFd,

0 commit comments

Comments
 (0)