Skip to content

Commit be0303e

Browse files
committed
vmm: fix kicking vCPU out of KVM_RUN from signal handler
A common scenario for a VMM to regain control over the vCPU thread from the hypervisor is to interrupt the vCPU. A use-case might be the `pause` API call of CHV. VMMs using KVM as hypervisor must use signals for this interception, i.e., a thread sends a signal to the vCPU thread. Sending and handling these signals is inherently racy because the signal sender does not know if the receiving thread is currently in the KVM_RUN [0] call, or executing userspace VMM code. If we are in kernel space in KVM_RUN, things are easy as KVM just exits with -EINTR. For user-space this is more complicated. For example, it might happen that we receive a signal but the vCPU thread was about to go into the KVM_RUN system call as its next instruction. There is no further opportunity to check for any pending signal flag or similar. KVM offers the `immediate_exit` flag [1] as part of the KVM_RUN structure for exactly this situation. The signal handler of a vCPU is supposed to set this flag, to ensure that we do not miss any events. If the flag is set, KVM_RUN will exit immediately [2]. We will miss signals to the vCPU if the vCPU thread is in userspace VMM code and we do not use the `immediate_exit` flag. The signal handler must therefore have access to the vCPU thread's KVM_RUN data structure so it can set the `immediate_exit` [1] flag when it executes in that thread's context. This way, the next invocation of KVM_RUN exits immediately and the userspace VMM code can do the normal event handling. We must not use any shared locks between the normal vCPU thread VMM code and the signal handler, as otherwise we might end up in deadlocks. The signal handler therefore needs its dedicated mutable view of KVM_RUN. This commit introduces a (very hacky but good enough for a PoC) solution to this problem. [0] https://docs.kernel.org/virt/kvm/api.html#kvm-run [1] https://docs.kernel.org/virt/kvm/api.html#the-kvm-run-structure [2] https://elixir.bootlin.com/linux/v6.12/source/arch/x86/kvm/x86.c#L11566
1 parent 33460e5 commit be0303e

File tree

4 files changed

+68
-2
lines changed

4 files changed

+68
-2
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vmm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] }
5252
log = "0.4.22"
5353
# Special fork of micro_http that combines HTTP traffic over a UNIX domain
5454
# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors.
55+
kvm-bindings = "0.10.0"
5556
micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" }
5657
mshv-bindings = { workspace = true, features = [
5758
"fam-wrappers",

vmm/src/cpu.rs

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ use std::collections::BTreeMap;
1616
use std::io::Write;
1717
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1818
use std::mem::size_of;
19-
#[cfg(feature = "kvm")]
20-
use std::os::fd::RawFd;
2119
use std::os::unix::thread::JoinHandleExt;
2220
use std::sync::atomic::{AtomicBool, Ordering};
2321
use std::sync::{Arc, Barrier, Mutex};
@@ -75,6 +73,8 @@ use vm_migration::{
7573
use vmm_sys_util::eventfd::EventFd;
7674
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
7775
use zerocopy::{FromBytes, Immutable, IntoBytes};
76+
#[cfg(feature = "kvm")]
77+
use {kvm_bindings::kvm_run, std::cell::Cell, std::os::fd::RawFd, std::sync::RwLock};
7878

7979
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
8080
use crate::coredump::{
@@ -92,6 +92,16 @@ use crate::vm::physical_bits;
9292
use crate::vm_config::CpusConfig;
9393
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
9494

95+
#[cfg(feature = "kvm")]
thread_local! {
    /// Per-vCPU-thread pointer to that thread's mmap'ed `kvm_run` structure.
    /// Stays null until the vCPU thread has established its KVM_RUN mapping;
    /// the SIGRTMIN signal handler reads it to set `immediate_exit`.
    static KVM_RUN: Cell<*mut kvm_run> = const { Cell::new(core::ptr::null_mut()) };
}

#[cfg(feature = "kvm")]
/// Tells the vCPU signal handler that the VMM is shutting down and the
/// thread-local storage may already be torn down; the handler then returns
/// early instead of touching it (which would otherwise panic).
/// A cleaner alternative would be to stop sending signals during shutdown
/// entirely.
pub static IS_IN_SHUTDOWN: RwLock<bool> = RwLock::new(false);
95105
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
96106
/// Extract the specified bits of a 64-bit integer.
97107
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
@@ -991,6 +1001,28 @@ impl CpuManager {
9911001
vcpu_thread_barrier: Arc<Barrier>,
9921002
inserting: bool,
9931003
) -> Result<()> {
1004+
{
1005+
#[cfg(feature = "kvm")]
1006+
{
1007+
let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd();
1008+
1009+
// SAFETY: We know the FD is valid and have the proper args.
1010+
let buffer = unsafe {
1011+
libc::mmap(
1012+
core::ptr::null_mut(),
1013+
4096,
1014+
libc::PROT_WRITE | libc::PROT_READ,
1015+
libc::MAP_SHARED,
1016+
raw_kvm_fd,
1017+
0,
1018+
)
1019+
};
1020+
assert_ne!(buffer, libc::MAP_FAILED);
1021+
let kvm_run = buffer.cast::<kvm_run>();
1022+
KVM_RUN.set(kvm_run);
1023+
}
1024+
}
1025+
9941026
let reset_evt = self.reset_evt.try_clone().unwrap();
9951027
let exit_evt = self.exit_evt.try_clone().unwrap();
9961028
#[cfg(feature = "kvm")]
@@ -1069,7 +1101,29 @@ impl CpuManager {
10691101
return;
10701102
}
10711103
}
1104+
1105+
#[cfg(not(feature = "kvm"))]
10721106
extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1107+
#[cfg(feature = "kvm")]
1108+
extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {
1109+
// Accessing the thread local panics for signals that are received in the shutdown phase.
1110+
// The more graceful way would be to prevent sending signals during that phase at all.
1111+
let lock = IS_IN_SHUTDOWN.read().unwrap();
1112+
if *lock {
1113+
return;
1114+
}
1115+
1116+
let kvm_run = KVM_RUN.get();
1117+
// SAFETY: We mapped the whole structure.
1118+
let ptr_kvm_run_immediate_exit = unsafe { kvm_run.cast::<u8>().add(1) };
1119+
// SAFETY: We know the mapping is valid.
1120+
unsafe {
1121+
core::ptr::write_volatile(ptr_kvm_run_immediate_exit, 1);
1122+
}
1123+
1124+
// TODO ::Release?
1125+
std::sync::atomic::fence(Ordering::SeqCst);
1126+
}
10731127
// This uses an async signal safe handler to kill the vcpu handles.
10741128
register_signal_handler(SIGRTMIN(), handle_signal)
10751129
.expect("Failed to register vcpu signal handler");

vmm/src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ use crate::api::{
5454
use crate::config::{add_to_config, RestoreConfig};
5555
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
5656
use crate::coredump::GuestDebuggable;
57+
#[cfg(feature = "kvm")]
58+
use crate::cpu::IS_IN_SHUTDOWN;
5759
use crate::landlock::Landlock;
5860
use crate::memory_manager::MemoryManager;
5961
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
@@ -1373,6 +1375,14 @@ impl Vmm {
13731375
vm.release_disk_locks()
13741376
.map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?;
13751377

1378+
#[cfg(feature = "kvm")]
1379+
// Prevent signal handler to access thread local storage when signals are received
1380+
// close to the end when thread-local storage is already destroyed.
1381+
{
1382+
let mut lock = IS_IN_SHUTDOWN.write().unwrap();
1383+
*lock = true;
1384+
}
1385+
13761386
// Capture snapshot and send it
13771387
let vm_snapshot = vm.snapshot()?;
13781388
let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap();

0 commit comments

Comments
 (0)