Skip to content

Commit 11e5fae

Browse files
committed
Fix calling kill in advance
Signed-off-by: Ludvig Liljenberg <[email protected]>
1 parent 18e2bb3 commit 11e5fae

File tree

7 files changed

+207
-482
lines changed

7 files changed

+207
-482
lines changed

src/hyperlight_host/benches/benchmarks.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ fn guest_call_benchmark(c: &mut Criterion) {
144144
// Small delay to ensure the guest function is running in VM before interrupting
145145
thread::sleep(std::time::Duration::from_millis(1));
146146
let kill_start = Instant::now();
147-
interrupt_handle.kill();
147+
assert!(interrupt_handle.kill());
148148
kill_start
149149
});
150150

src/hyperlight_host/src/hypervisor/hyperlight_vm.rs

Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ use std::convert::TryFrom;
1919
#[cfg(crashdump)]
2020
use std::path::Path;
2121
#[cfg(target_os = "windows")]
22-
use std::sync::atomic::AtomicBool;
23-
#[cfg(any(kvm, mshv))]
22+
use std::sync::atomic::{AtomicBool, AtomicU64};
23+
#[cfg(any(kvm, mshv3))]
2424
use std::sync::atomic::{AtomicBool, AtomicU64};
2525
use std::sync::{Arc, Mutex};
2626

@@ -29,7 +29,7 @@ use tracing::{Span, instrument};
2929
#[cfg(feature = "trace_guest")]
3030
use tracing_opentelemetry::OpenTelemetrySpanExt;
3131

32-
#[cfg(any(kvm, mshv))]
32+
#[cfg(any(kvm, mshv3))]
3333
use super::LinuxInterruptHandle;
3434
#[cfg(target_os = "windows")]
3535
use super::WindowsInterruptHandle;
@@ -40,7 +40,7 @@ use super::{InterruptHandle, InterruptHandleImpl, get_max_log_level};
4040
use crate::HyperlightError::{ExecutionCanceledByHost, NoHypervisorFound};
4141
#[cfg(crashdump)]
4242
use crate::hypervisor::crashdump;
43-
#[cfg(mshv)]
43+
#[cfg(mshv3)]
4444
use crate::hypervisor::hyperv_linux::MshvVm;
4545
#[cfg(target_os = "windows")]
4646
use crate::hypervisor::hyperv_windows::WhpVm;
@@ -98,7 +98,7 @@ impl HyperlightVm {
9898
pml4_addr: u64,
9999
entrypoint: u64,
100100
rsp: u64,
101-
#[cfg_attr(not(any(kvm, mshv)), allow(unused_variables))] config: &SandboxConfiguration,
101+
#[cfg_attr(not(any(kvm, mshv3)), allow(unused_variables))] config: &SandboxConfiguration,
102102
#[cfg(target_os = "windows")] handle: HandleWrapper,
103103
#[cfg(target_os = "windows")] raw_size: usize,
104104
#[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
@@ -109,7 +109,7 @@ impl HyperlightVm {
109109
let mut vm: Box<dyn Vm> = match get_available_hypervisor() {
110110
#[cfg(kvm)]
111111
Some(HypervisorType::Kvm) => Box::new(KvmVm::new()?),
112-
#[cfg(mshv)]
112+
#[cfg(mshv3)]
113113
Some(HypervisorType::Mshv) => Box::new(MshvVm::new()?),
114114
#[cfg(target_os = "windows")]
115115
Some(HypervisorType::Whp) => Box::new(WhpVm::new(handle, raw_size)?),
@@ -143,10 +143,9 @@ impl HyperlightVm {
143143

144144
let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?;
145145

146-
#[cfg(any(kvm, mshv))]
146+
#[cfg(any(kvm, mshv3))]
147147
let interrupt_handle: Arc<dyn InterruptHandleImpl> = Arc::new(LinuxInterruptHandle {
148148
running: AtomicU64::new(0),
149-
cancel_requested: AtomicBool::new(false),
150149
#[cfg(gdb)]
151150
debug_interrupt: AtomicBool::new(false),
152151
#[cfg(all(
@@ -170,8 +169,7 @@ impl HyperlightVm {
170169

171170
#[cfg(target_os = "windows")]
172171
let interrupt_handle: Arc<dyn InterruptHandleImpl> = Arc::new(WindowsInterruptHandle {
173-
running: AtomicBool::new(false),
174-
cancel_requested: AtomicBool::new(false),
172+
state: AtomicU64::new(0),
175173
#[cfg(gdb)]
176174
debug_interrupt: AtomicBool::new(false),
177175
partition_handle: vm.partition_handle(),
@@ -374,28 +372,42 @@ impl HyperlightVm {
374372
&mut self,
375373
#[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
376374
) -> Result<()> {
375+
// ===== KILL() TIMING POINT 1: Between guest function calls =====
376+
// Clear any stale cancellation from a previous guest function call or if kill() was called too early.
377+
// This ensures that kill() called BETWEEN different guest function calls doesn't affect the next call.
378+
//
379+
// If kill() was called and ran to completion BEFORE this line executes:
380+
// - kill() has NO effect on this guest function call because CANCEL_BIT is cleared here.
381+
// - NOTE: stale signals can still be delivered, but they will be ignored.
382+
self.interrupt_handle.clear_cancel();
383+
377384
// Keeps the trace context and open spans
378385
#[cfg(feature = "trace_guest")]
379386
let mut tc = crate::sandbox::trace::TraceContext::new();
380387

381388
let result = loop {
389+
// ===== KILL() TIMING POINT 2: Before set_tid() =====
390+
// If kill() is called and ran to completion BEFORE this line executes:
391+
// - CANCEL_BIT will be set and we will return an early VmExit::Cancelled()
382392
self.interrupt_handle.set_tid();
383-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
384-
// Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
385393
self.interrupt_handle.set_running();
386394

387-
// Don't run the vcpu if `cancel_requested` is true
388-
//
389-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
390-
// Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
391395
let exit_reason = if self.interrupt_handle.is_cancelled()
392396
|| self.interrupt_handle.is_debug_interrupted()
393397
{
394398
Ok(VmExit::Cancelled())
395399
} else {
396400
#[cfg(feature = "trace_guest")]
397401
tc.setup_guest_trace(Span::current().context());
402+
403+
// ===== KILL() TIMING POINT 3: Before calling run_vcpu() =====
404+
// If kill() is called and ran to completion BEFORE this line executes:
405+
// - CANCEL_BIT will be set, but it's too late to prevent entering the guest this iteration
406+
// - Signals will interrupt the guest (RUNNING_BIT=true), causing VmExit::Cancelled()
407+
// - If the guest completes before any signals arrive, kill() may have no effect
408+
// - If there are more iterations to do (IO/host func, etc.), the next iteration will be cancelled
398409
let exit_reason = self.vm.run_vcpu();
410+
399411
// End current host trace by closing the current span that captures traces
400412
// happening when a guest exits and re-enters.
401413
#[cfg(feature = "trace_guest")]
@@ -411,24 +423,28 @@ impl HyperlightVm {
411423
exit_reason
412424
};
413425

414-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
415-
// Then signals will be sent to this thread until `running` is set to false.
416-
// This is fine since the signal handler is a no-op.
426+
// ===== KILL() TIMING POINT 4: Before capturing cancel_requested =====
427+
// If kill() is called and ran to completion BEFORE this line executes:
428+
// - CANCEL_BIT will be set
429+
// - Signals may still be sent (RUNNING_BIT=true) but are harmless no-ops
430+
// - kill() will have no effect on this iteration, but CANCEL_BIT will persist
431+
// - If the loop continues (e.g., for a host call), the next iteration will be cancelled
432+
// - Stale signals from before clear_running() may arrive and kick future iterations,
433+
// but will be filtered out by the cancel_requested check below (and retried).
417434
let cancel_requested = self.interrupt_handle.is_cancelled();
418-
#[cfg(any(kvm, mshv))]
419435
let debug_interrupted = self.interrupt_handle.is_debug_interrupted();
420436

421-
// Note: if a `InterruptHandle::kill()` called while this thread is **here**
422-
// Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
423-
// Additionally signals will be sent to this thread until `running` is set to false.
424-
// This is fine since the signal handler is a no-op.
437+
// ===== KILL() TIMING POINT 5: Before calling clear_running() =====
438+
// Same as point 4.
425439
self.interrupt_handle.clear_running();
426-
self.interrupt_handle.clear_cancel();
427440

428-
// At this point, `running` is `false` so no more signals will be sent to this thread,
429-
// but we may still receive async signals that were sent before this point.
430-
// To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
431-
// we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
441+
// ===== KILL() TIMING POINT 6: After calling clear_running() =====
442+
// If kill() is called and ran to completion BEFORE this line executes:
443+
// - CANCEL_BIT will be set but won't affect this iteration, it is never read below this comment
444+
// and cleared at next run() start
445+
// - RUNNING_BIT=false, so no new signals will be sent
446+
// - Stale signals from before clear_running() may arrive and kick future iterations,
447+
// but will be filtered out by the cancel_requested check below (and retried).
432448
match exit_reason {
433449
#[cfg(gdb)]
434450
Ok(VmExit::Debug { dr6, exception }) => {
@@ -517,13 +533,10 @@ impl HyperlightVm {
517533
}
518534
}
519535
Ok(VmExit::Cancelled()) => {
520-
// On Linux (kvm/mshv), if cancellation was not requested for this specific vm,
521-
// the vcpu was interrupted because of debug interrupt or a stale signal that
522-
// meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
523-
// On Windows, WHvCancelRunVirtualProcessor is explicit, so we always break
524-
#[cfg(any(kvm, mshv))]
536+
// If cancellation was not requested for this specific guest function call,
537+
// the vcpu was interrupted by a stale cancellation from a previous call
525538
if !cancel_requested && !debug_interrupted {
526-
// treat this the same as a VmExit::Retry, the cancel was not meant for this vcpu
539+
// treat this the same as a VmExit::Retry, the cancel was not meant for this call
527540
continue;
528541
}
529542

@@ -538,10 +551,6 @@ impl HyperlightVm {
538551
}
539552
}
540553

541-
if cancel_requested {
542-
self.interrupt_handle.clear_cancel();
543-
}
544-
545554
metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
546555
break Err(ExecutionCanceledByHost());
547556
}

src/hyperlight_host/src/hypervisor/hyperv_linux.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ use std::sync::LazyLock;
2020

2121
#[cfg(gdb)]
2222
use mshv_bindings::DebugRegisters;
23-
#[cfg(mshv2)]
24-
use mshv_bindings::hv_message;
2523
#[cfg(gdb)]
2624
use mshv_bindings::hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT;
2725
use mshv_bindings::{

0 commit comments

Comments
 (0)