
Commit a5fe486

vmm: fix kicking vCPU out of KVM_RUN from signal handler
A common scenario for a VMM to regain control over a vCPU thread from the hypervisor is to interrupt the vCPU. A use-case might be the `pause` API call of CHV. VMMs using KVM as hypervisor must use signals for this interception, i.e., a thread sends a signal to the vCPU thread.

Sending and handling these signals is inherently racy, because the signal sender does not know whether the receiving thread is currently in the KVM_RUN [0] call or executing userspace VMM code. If we are in kernel space in KVM_RUN, things are easy: KVM just exits with -EINTR. For user space, things are more complicated. For example, a signal may arrive when the vCPU thread is about to enter the KVM_RUN system call as its very next instruction, leaving no further opportunity to check a pending-signal flag or similar. Without a countermeasure, signals delivered while the vCPU thread executes userspace VMM code would be lost.

KVM offers the `immediate_exit` flag [1] as part of the KVM_RUN structure for exactly this case. The signal handler of a vCPU thread is supposed to set this flag so that no event is missed: if the flag is set, KVM_RUN exits immediately [2]. The signal handler must therefore have access to the KVM_RUN data structure when it executes in a vCPU thread's context, so that it can set the `immediate_exit` [1] flag. This way, the next invocation of KVM_RUN exits immediately and the userspace VMM code can do its normal event handling.

We must not use any locks shared between the normal vCPU-thread VMM code and the signal handler, as we might otherwise end up in deadlocks. The signal handler therefore needs its own dedicated mutable view of KVM_RUN.

This commit introduces a (very hacky, but good enough for a PoC) solution to this problem.

[0] https://docs.kernel.org/virt/kvm/api.html#kvm-run
[1] https://docs.kernel.org/virt/kvm/api.html#the-kvm-run-structure
[2] https://elixir.bootlin.com/linux/v6.12/source/arch/x86/kvm/x86.c#L11566
Parent: 33460e5
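To make the mechanism concrete, here is a minimal sketch of the protocol described above, written against the kvm-ioctls crate rather than CHV's actual code. `handle_pending_events` and `set_immediate_exit_for_current_vcpu` are hypothetical helpers; the raw-pointer write this commit actually performs is shown later in the diff.

// Sketch only: the vCPU loop, assuming a kvm_ioctls::VcpuFd named `vcpu_fd`.
loop {
    match vcpu_fd.run() {
        // KVM_RUN returned -EINTR: either a signal arrived while we were
        // inside the syscall, or `immediate_exit` was set before we entered
        // it. Clear the flag and process whatever event the signal announced.
        Err(e) if e.errno() == libc::EINTR => {
            vcpu_fd.set_kvm_immediate_exit(0);
            handle_pending_events(); // hypothetical helper
        }
        Ok(_exit_reason) => { /* regular VM-exit handling */ }
        Err(e) => panic!("KVM_RUN failed: {e}"),
    }
}

// Sketch only: the signal handler on the vCPU thread. Whether the thread is
// currently in KVM_RUN (the syscall returns -EINTR) or still in userspace
// (the next KVM_RUN exits immediately), the event cannot be lost.
extern "C" fn handle_signal(_: libc::c_int) {
    // Only async-signal-safe work is allowed here.
    set_immediate_exit_for_current_vcpu(); // hypothetical: sets kvm_run.immediate_exit = 1
}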

File tree

4 files changed, +70 −3 lines

Cargo.lock

Lines changed: 1 addition & 0 deletions
(Generated file; diff not rendered by default.)

vmm/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] }
 log = "0.4.22"
 # Special fork of micro_http that combines HTTP traffic over a UNIX domain
 # socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors.
+kvm-bindings = "0.10.0"
 micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" }
 mshv-bindings = { workspace = true, features = [
     "fam-wrappers",

vmm/src/cpu.rs

Lines changed: 58 additions & 3 deletions
@@ -11,16 +11,15 @@
 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
 //
 
+use std::cell::Cell;
 use std::collections::BTreeMap;
 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
 use std::io::Write;
 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
 use std::mem::size_of;
-#[cfg(feature = "kvm")]
-use std::os::fd::RawFd;
 use std::os::unix::thread::JoinHandleExt;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Barrier, Mutex};
+use std::sync::{Arc, Barrier, Mutex, RwLock};
 use std::{cmp, io, result, thread};
 
 #[cfg(not(target_arch = "riscv64"))]
@@ -75,6 +74,8 @@ use vm_migration::{
 use vmm_sys_util::eventfd::EventFd;
 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
 use zerocopy::{FromBytes, Immutable, IntoBytes};
+#[cfg(feature = "kvm")]
+use {kvm_bindings::kvm_run, std::os::fd::RawFd};
 
 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
 use crate::coredump::{
@@ -92,6 +93,16 @@ use crate::vm::physical_bits;
 use crate::vm_config::CpusConfig;
 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
 
+#[cfg(feature = "kvm")]
+thread_local! {
+    static KVM_RUN: Cell<*mut kvm_run> = const { Cell::new(core::ptr::null_mut()) };
+}
+#[cfg(feature = "kvm")]
+/// Tell the signal handler to not access certain state anymore during shutdown.
+/// Otherwise => panics.
+/// A better alternative would be to prevent signals there at all.
+pub static IS_IN_SHUTDOWN: RwLock<bool> = RwLock::new(false);
+
 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
 /// Extract the specified bits of a 64-bit integer.
 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`,
@@ -991,6 +1002,28 @@ impl CpuManager {
         vcpu_thread_barrier: Arc<Barrier>,
         inserting: bool,
     ) -> Result<()> {
+        {
+            #[cfg(feature = "kvm")]
+            {
+                let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd();
+
+                // SAFETY: We know the FD is valid and have the proper args.
+                let buffer = unsafe {
+                    libc::mmap(
+                        core::ptr::null_mut(),
+                        4096,
+                        libc::PROT_WRITE | libc::PROT_READ,
+                        libc::MAP_SHARED,
+                        raw_kvm_fd,
+                        0,
+                    )
+                };
+                assert_ne!(buffer, libc::MAP_FAILED);
+                let kvm_run = buffer.cast::<kvm_run>();
+                KVM_RUN.set(kvm_run);
+            }
+        }
+
         let reset_evt = self.reset_evt.try_clone().unwrap();
         let exit_evt = self.exit_evt.try_clone().unwrap();
         #[cfg(feature = "kvm")]
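One caveat with the hunk above: the mapping length is hard-coded to 4096, while KVM reports the actual size of the kvm_run mapping via the KVM_GET_VCPU_MMAP_SIZE ioctl, exposed by the kvm-ioctls crate as Kvm::get_vcpu_mmap_size(). A sketch of the more robust variant, assuming a kvm_ioctls::Kvm handle named `kvm` is in scope:

// Ask KVM how large the per-vCPU kvm_run mapping really is.
let mmap_size = kvm.get_vcpu_mmap_size().expect("KVM_GET_VCPU_MMAP_SIZE failed");
// SAFETY: raw_kvm_fd is a valid vCPU fd and KVM places kvm_run at offset 0.
let buffer = unsafe {
    libc::mmap(
        core::ptr::null_mut(),
        mmap_size,
        libc::PROT_READ | libc::PROT_WRITE,
        libc::MAP_SHARED,
        raw_kvm_fd,
        0,
    )
};
assert_ne!(buffer, libc::MAP_FAILED);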
@@ -1069,7 +1102,29 @@ impl CpuManager {
                         return;
                     }
                 }
+
+                #[cfg(not(feature = "kvm"))]
                 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
+                #[cfg(feature = "kvm")]
+                extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {
+                    // Accessing the thread local panics for signals that are received in the shutdown phase.
+                    // The more graceful way would be to prevent sending signals during that phase at all.
+                    let lock = IS_IN_SHUTDOWN.read().unwrap();
+                    if *lock {
+                        return;
+                    }
+
+                    let kvm_run = KVM_RUN.get();
+                    // SAFETY: We mapped the whole structure.
+                    let ptr_kvm_run_immediate_exit = unsafe { kvm_run.cast::<u8>().add(1) };
+                    // SAFETY: We know the mapping is valid.
+                    unsafe {
+                        core::ptr::write_volatile(ptr_kvm_run_immediate_exit, 1);
+                    }
+
+                    // TODO ::Release?
+                    std::sync::atomic::fence(Ordering::SeqCst);
+                }
                 // This uses an async signal safe handler to kill the vcpu handles.
                 register_signal_handler(SIGRTMIN(), handle_signal)
                     .expect("Failed to register vcpu signal handler");

vmm/src/lib.rs

Lines changed: 10 additions & 0 deletions
@@ -46,6 +46,8 @@ use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottabl
 use vmm_sys_util::eventfd::EventFd;
 use vmm_sys_util::signal::unblock_signal;
 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
+#[cfg(feature = "kvm")]
+use {crate::cpu::IS_IN_SHUTDOWN, std::sync::atomic::Ordering};
 
 use crate::api::{
     ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData,
@@ -1373,6 +1375,14 @@ impl Vmm {
         vm.release_disk_locks()
             .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?;
 
+        #[cfg(feature = "kvm")]
+        // Prevent the signal handler from accessing thread-local storage when
+        // signals are received close to the end, when TLS is already destroyed.
+        {
+            let mut lock = IS_IN_SHUTDOWN.write().unwrap();
+            *lock = true;
+        }
+
         // Capture snapshot and send it
         let vm_snapshot = vm.snapshot()?;
         let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap();
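A note on the IS_IN_SHUTDOWN flag: taking an RwLock inside a signal handler is not async-signal-safe, and the handler can deadlock if it interrupts a thread that holds the write lock. Because the flag is a plain boolean, an AtomicBool would sidestep the lock entirely and would also answer the "TODO ::Release?" fence question in the handler. A sketch of that alternative (not what this commit does):

use std::sync::atomic::{AtomicBool, Ordering};

pub static IS_IN_SHUTDOWN: AtomicBool = AtomicBool::new(false);

// Shutdown path (the hunk above): flip the flag without taking any lock.
IS_IN_SHUTDOWN.store(true, Ordering::Release);

// Signal handler: a lock-free, async-signal-safe read.
if IS_IN_SHUTDOWN.load(Ordering::Acquire) {
    return;
}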
