Skip to content

Commit 459fca4

Browse files
authored
[sled-agent] VMM graceful shutdown timeout (#7548)
Presently, sled-agent's `InstanceRunner` has two mechanisms for shutting down a VMM: sending an instance state PUT request to the `propolis-server` process for the `Stopped` state, or forcibly terminating the `propolis-server` and tearing down the zone. At present, when a request to stop an instance is sent to the sled-agent, it uses the first mechanism, where Propolis is politely asked to stop the instance --- which I'll refer to as "graceful shutdown". The forceful termination path is used when asked to unregister an instance where the VMM has not started up yet, when encountering an unrecoverable VMM error, or when killing an instance that was making use of an expunged disk. Currently, these two paths don't really overlap: when Nexus asks a sled-agent to stop an instance, all it will do is politely ask Propolis to please stop the instance gracefully, and will only fall back to violently shooting the zone in the face if Propolis returns the error that indicates it never knew about that instance in the first place. This means that, should a VMM get *stuck* while shutting down the instance, stopping it will never complete successfully, and the Propolis zone won't get cleaned up. This can happen due to e.g. [a Crucible activation that will never complete][1]. Thus, the sled-agent should attempt to violently terminate a Propolis zone when a graceful shutdown of the VMM fails to complete in a timely manner. This commit introduces a timeout for the graceful shutdown process. Now, when we send a PUT request to Propolis with the `Stopped` instance state, the sled-agent will start a 10-minute timer. If no update from Propolis that indicates the instance has transitioned to `Stopped` is received before the timer completes, the sled-agent will proceed with the forceful termination of the Propolis zone. Fixes #4004. Closes #6795 [1]: #4004 (comment)
1 parent 6b6ec4e commit 459fca4

File tree

3 files changed

+290
-7
lines changed

3 files changed

+290
-7
lines changed

Cargo.lock

Lines changed: 12 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,10 @@ progenitor-client = "0.9.1"
569569
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
570570
propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
571571
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
572-
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
572+
# NOTE: propolis-mock-server is currently out of sync with other propolis deps,
573+
# but when we update propolis to a commit after 2652487, please just bump this
574+
# as well and delete this comment.
575+
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "2652487c19ede2db2cbc634b0dee3a78848dfea1" }
573576
# NOTE: see above!
574577
proptest = "1.5.0"
575578
qorb = "0.2.1"

sled-agent/src/instance.rs

Lines changed: 274 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ use sled_storage::manager::StorageHandle;
4747
use slog::Logger;
4848
use std::net::IpAddr;
4949
use std::net::SocketAddr;
50+
use std::pin::Pin;
5051
use std::sync::Arc;
52+
use std::time::Duration;
5153
use tokio::sync::{mpsc, oneshot};
5254
use uuid::Uuid;
5355

@@ -472,8 +474,31 @@ struct InstanceRunner {
472474
}
473475

474476
impl InstanceRunner {
477+
/// How long to wait for VMM shutdown to complete before forcefully
478+
/// terminating the zone.
479+
const STOP_GRACE_PERIOD: Duration = Duration::from_secs(60 * 10);
480+
475481
async fn run(mut self, mut terminate_rx: mpsc::Receiver<TerminateRequest>) {
476482
use InstanceRequest::*;
483+
484+
// Timeout for stopping the instance gracefully.
485+
//
486+
// When we send Propolis a put-state request to transition to
487+
// Stopped`, we start this timer. If Propolis does not report back
488+
// that it has stopped the instance within `STOP_GRACE_PERIOD`, we
489+
// will forcibly terminate the VMM. This is to ensure that a totally
490+
// stuck VMM does not prevent the Propolis zone from being cleaned up.
491+
let mut stop_timeout = None;
492+
async fn stop_timeout_completed(
493+
stop_timeout: &mut Option<Pin<Box<tokio::time::Sleep>>>,
494+
) {
495+
if let Some(ref mut timeout) = stop_timeout {
496+
timeout.await
497+
} else {
498+
std::future::pending().await
499+
}
500+
}
501+
477502
while !self.should_terminate {
478503
tokio::select! {
479504
biased;
@@ -539,6 +564,25 @@ impl InstanceRunner {
539564
},
540565
}
541566
},
567+
568+
// We are waiting for the VMM to stop, and the grace period has
569+
// elapsed without us hearing back from Propolis that it has
570+
// stopped the instance. Time to terminate it violently!
571+
//
572+
// Note that this must be a lower priority in the `select!` than
573+
// instance monitor requests, as we would prefer to honor a
574+
// message from the instance monitor indicating a successful
575+
// stop, even if we have reached the timeout.
576+
_ = stop_timeout_completed(&mut stop_timeout) => {
577+
warn!(
578+
self.log,
579+
"Instance failed to stop within the grace period, \
580+
terminating it violently!",
581+
);
582+
let mark_failed = false;
583+
self.terminate(mark_failed).await;
584+
}
585+
542586
// Requests to terminate the instance take priority over any
543587
// other request to the instance.
544588
request = terminate_rx.recv() => {
@@ -584,7 +628,22 @@ impl InstanceRunner {
584628
tx.send(Ok(self.current_state()))
585629
.map_err(|_| Error::FailedSendClientClosed)
586630
},
587-
PutState{ state, tx } => {
631+
PutState { state, tx } => {
632+
// If we're going to stop the instance, start
633+
// the timeout after which we will forcefully
634+
// terminate the VMM.
635+
if let VmmStateRequested::Stopped = state {
636+
// Only start the stop timeout if there
637+
// isn't one already, so that additional
638+
// requests to stop coming in don't reset
639+
// the clock.
640+
if stop_timeout.is_none() {
641+
stop_timeout = Some(Box::pin(tokio::time::sleep(
642+
Self::STOP_GRACE_PERIOD
643+
)));
644+
}
645+
}
646+
588647
tx.send(self.put_state(state).await
589648
.map(|r| VmmPutStateResponse { updated_runtime: Some(r) })
590649
.map_err(|e| e.into()))
@@ -639,6 +698,7 @@ impl InstanceRunner {
639698

640699
}
641700
}
701+
642702
self.publish_state_to_nexus().await;
643703

644704
// Okay, now that we've terminated the instance, drain any outstanding
@@ -2383,8 +2443,6 @@ mod tests {
23832443

23842444
// note the "mock" here is different from the vnic/zone contexts above.
23852445
// this is actually running code for a dropshot server from propolis.
2386-
// (might we want a locally-defined fake whose behavior we can control
2387-
// more directly from the test driver?)
23882446
// TODO: factor out, this is also in sled-agent-sim.
23892447
fn propolis_mock_server(
23902448
log: &Logger,
@@ -2407,6 +2465,30 @@ mod tests {
24072465
(srv, client)
24082466
}
24092467

2468+
async fn propolis_mock_set_mode(
2469+
client: &PropolisClient,
2470+
mode: propolis_mock_server::MockMode,
2471+
) {
2472+
let url = format!("{}/mock/mode", client.baseurl());
2473+
client
2474+
.client()
2475+
.put(url)
2476+
.json(&mode)
2477+
.send()
2478+
.await
2479+
.expect("setting mock mode failed unexpectedly");
2480+
}
2481+
2482+
async fn propolis_mock_step(client: &PropolisClient) {
2483+
let url = format!("{}/mock/step", client.baseurl());
2484+
client
2485+
.client()
2486+
.put(url)
2487+
.send()
2488+
.await
2489+
.expect("single-stepping mock server failed unexpectedly");
2490+
}
2491+
24102492
async fn setup_storage_manager(log: &Logger) -> StorageManagerTestHarness {
24112493
let mut harness = StorageManagerTestHarness::new(log).await;
24122494
let raw_disks =
@@ -2957,4 +3039,193 @@ mod tests {
29573039
storage_harness.cleanup().await;
29583040
logctx.cleanup_successful();
29593041
}
3042+
3043+
// Tests a scenario in which a propolis-server process fails to stop in a
3044+
// timely manner (i.e. it's gotten stuck somehow). This test asserts that
3045+
// the sled-agent will eventually forcibly terminate the VMM process, tear
3046+
// down the zone, and tell Nexus that the VMM has been destroyed, even
3047+
// if Propolis can't shut itself down nicely.
3048+
#[tokio::test]
3049+
async fn test_instance_manager_stop_timeout() {
3050+
let logctx = omicron_test_utils::dev::test_setup_log(
3051+
"test_instance_manager_stop_timeout",
3052+
);
3053+
let log = logctx.log.new(o!(FileKv));
3054+
3055+
// automock'd things used during this test
3056+
let _mock_vnic_contexts = mock_vnic_contexts();
3057+
let _mock_zone_contexts = mock_zone_contexts();
3058+
3059+
let mut storage_harness = setup_storage_manager(&logctx.log).await;
3060+
let storage_handle = storage_harness.handle().clone();
3061+
3062+
let FakeNexusParts {
3063+
nexus_client,
3064+
mut state_rx,
3065+
_dns_server,
3066+
_nexus_server,
3067+
} = FakeNexusParts::new(&log).await;
3068+
3069+
let temp_guard = Utf8TempDir::new().unwrap();
3070+
let temp_dir = temp_guard.path().to_string();
3071+
3072+
let (services, _metrics_rx) = fake_instance_manager_services(
3073+
&log,
3074+
storage_handle,
3075+
nexus_client,
3076+
&temp_dir,
3077+
);
3078+
let InstanceManagerServices {
3079+
nexus_client,
3080+
vnic_allocator: _,
3081+
port_manager,
3082+
storage,
3083+
zone_bundler,
3084+
zone_builder_factory,
3085+
metrics_queue,
3086+
} = services;
3087+
3088+
let etherstub = Etherstub("mystub".to_string());
3089+
3090+
let vmm_reservoir_manager = VmmReservoirManagerHandle::stub_for_test();
3091+
3092+
let mgr = crate::instance_manager::InstanceManager::new(
3093+
logctx.log.new(o!("component" => "InstanceManager")),
3094+
nexus_client,
3095+
etherstub,
3096+
port_manager,
3097+
storage,
3098+
zone_bundler,
3099+
zone_builder_factory,
3100+
vmm_reservoir_manager,
3101+
metrics_queue,
3102+
)
3103+
.unwrap();
3104+
3105+
let (propolis_server, propolis_client) =
3106+
propolis_mock_server(&logctx.log);
3107+
let propolis_addr = propolis_server.local_addr();
3108+
3109+
let instance_id = InstanceUuid::new_v4();
3110+
let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID);
3111+
let zone_name = propolis_zone_name(&propolis_id);
3112+
let InstanceInitialState {
3113+
hardware,
3114+
vmm_runtime,
3115+
propolis_addr,
3116+
migration_id: _,
3117+
} = fake_instance_initial_state(propolis_addr);
3118+
3119+
let metadata = InstanceMetadata {
3120+
silo_id: Uuid::new_v4(),
3121+
project_id: Uuid::new_v4(),
3122+
};
3123+
let sled_identifiers = SledIdentifiers {
3124+
rack_id: Uuid::new_v4(),
3125+
sled_id: Uuid::new_v4(),
3126+
model: "fake-model".into(),
3127+
revision: 1,
3128+
serial: "fake-serial".into(),
3129+
};
3130+
3131+
mgr.ensure_registered(
3132+
propolis_id,
3133+
InstanceEnsureBody {
3134+
instance_id,
3135+
migration_id: None,
3136+
hardware,
3137+
vmm_runtime,
3138+
propolis_addr,
3139+
metadata,
3140+
},
3141+
sled_identifiers,
3142+
)
3143+
.await
3144+
.unwrap();
3145+
3146+
mgr.ensure_state(propolis_id, VmmStateRequested::Running)
3147+
.await
3148+
.unwrap();
3149+
3150+
timeout(
3151+
TIMEOUT_DURATION,
3152+
state_rx.wait_for(|maybe_state| match maybe_state {
3153+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3154+
sled_inst_state.vmm_state.state == VmmState::Running
3155+
}
3156+
_ => false,
3157+
}),
3158+
)
3159+
.await
3160+
.expect("timed out waiting for InstanceState::Running in FakeNexus")
3161+
.expect("failed to receive VmmState' InstanceState");
3162+
3163+
// Place the mock propolis in single-step mode.
3164+
propolis_mock_set_mode(
3165+
&propolis_client,
3166+
propolis_mock_server::MockMode::SingleStep,
3167+
)
3168+
.await;
3169+
3170+
// Request the VMM stop
3171+
mgr.ensure_state(propolis_id, VmmStateRequested::Stopped)
3172+
.await
3173+
.unwrap();
3174+
3175+
// Single-step once.
3176+
propolis_mock_step(&propolis_client).await;
3177+
3178+
timeout(
3179+
TIMEOUT_DURATION,
3180+
state_rx.wait_for(|maybe_state| match maybe_state {
3181+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3182+
sled_inst_state.vmm_state.state == VmmState::Stopping
3183+
}
3184+
_ => false,
3185+
}),
3186+
)
3187+
.await
3188+
.expect("timed out waiting for VmmState::Stopping in FakeNexus")
3189+
.expect("failed to receive FakeNexus' InstanceState");
3190+
3191+
// NOW WE STOP ADVANCING THE MOCK PROPOLIS STATE MACHINE --- IT WILL
3192+
// NEVER REACH `Stopped`.
3193+
3194+
// Expect that the `InstanceRunner` will attempt to halt and remove the
3195+
// zone.
3196+
let halt_ctx = MockZones::halt_and_remove_logged_context();
3197+
halt_ctx.expect().returning(move |_, name| {
3198+
assert_eq!(name, &zone_name);
3199+
Ok(())
3200+
});
3201+
3202+
// Now, pause time and advance the Tokio clock past the stop grace
3203+
// period. This should cause the stop timeout to fire, without requiring
3204+
// the test to actually wait for ten minutes.
3205+
tokio::time::pause();
3206+
tokio::time::advance(
3207+
super::InstanceRunner::STOP_GRACE_PERIOD + Duration::from_secs(1),
3208+
)
3209+
.await;
3210+
tokio::time::resume();
3211+
3212+
// The timeout should now fire and sled-agent will murder propolis,
3213+
// allowing the zone to be destroyed.
3214+
3215+
timeout(
3216+
TIMEOUT_DURATION,
3217+
state_rx.wait_for(|maybe_state| match maybe_state {
3218+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3219+
sled_inst_state.vmm_state.state == VmmState::Destroyed
3220+
}
3221+
_ => false,
3222+
}),
3223+
)
3224+
.await
3225+
.expect("timed out waiting for VmmState::Stopped in FakeNexus")
3226+
.expect("failed to receive FakeNexus' InstanceState");
3227+
3228+
storage_harness.cleanup().await;
3229+
logctx.cleanup_successful();
3230+
}
29603231
}

0 commit comments

Comments
 (0)