Skip to content

Commit 71fbb2b

Browse files
authored
nexus quiesce needs to deal with saga recovery and re-assignment (#8794)
1 parent d1f4c1b commit 71fbb2b

File tree

18 files changed, with 1298 additions (+1298) and 306 deletions (-306).

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clients/nexus-client/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ progenitor::generate_api!(
8888
}
8989
);
9090

91-
impl IdOrdItem for types::RunningSagaInfo {
91+
impl IdOrdItem for types::PendingSagaInfo {
9292
type Key<'a> = Uuid;
9393

9494
fn key(&self) -> Self::Key<'_> {

dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,12 @@ async fn quiesce_show(
117117
}
118118
}
119119

120-
println!("sagas running: {}", quiesce.sagas_running.len());
121-
for saga in &quiesce.sagas_running {
120+
println!("sagas running: {}", quiesce.sagas_pending.len());
121+
for saga in &quiesce.sagas_pending {
122122
println!(
123-
" saga {} started at {} ({})",
123+
" saga {} pending since {} ({})",
124124
saga.saga_id,
125-
humantime::format_rfc3339_millis(saga.time_started.into()),
125+
humantime::format_rfc3339_millis(saga.time_pending.into()),
126126
saga.saga_name
127127
);
128128
}

dev-tools/reconfigurator-exec-unsafe/src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use nexus_reconfigurator_execution::{RequiredRealizeArgs, realize_blueprint};
2121
use nexus_types::deployment::Blueprint;
2222
use nexus_types::deployment::PendingMgsUpdates;
2323
use nexus_types::deployment::SledFilter;
24+
use nexus_types::quiesce::SagaQuiesceHandle;
2425
use omicron_common::api::external::DataPageParams;
2526
use omicron_uuid_kinds::GenericUuid;
2627
use omicron_uuid_kinds::OmicronZoneUuid;
@@ -248,6 +249,7 @@ impl ReconfiguratorExec {
248249
// closed. Clone this sender so that it doesn't get shut down
249250
// right away.
250251
mgs_updates: mgs_updates.clone(),
252+
saga_quiesce: SagaQuiesceHandle::new(opctx.log.clone()),
251253
}
252254
.into(),
253255
)

nexus/reconfigurator/execution/src/lib.rs

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use nexus_types::deployment::execution::{
2121
Overridables, ReconfiguratorExecutionSpec, SharedStepHandle, Sled,
2222
StepHandle, StepResult, UpdateEngine,
2323
};
24+
use nexus_types::quiesce::SagaQuiesceHandle;
2425
use omicron_uuid_kinds::OmicronZoneUuid;
2526
use slog::info;
2627
use slog_error_chain::InlineErrorChain;
@@ -56,6 +57,7 @@ pub struct RealizeArgs<'a> {
5657
pub sender: mpsc::Sender<Event>,
5758
pub overrides: Option<&'a Overridables>,
5859
pub mgs_updates: watch::Sender<PendingMgsUpdates>,
60+
pub saga_quiesce: SagaQuiesceHandle,
5961
}
6062

6163
impl<'a> RealizeArgs<'a> {
@@ -103,6 +105,7 @@ pub struct RequiredRealizeArgs<'a> {
103105
pub blueprint: &'a Blueprint,
104106
pub sender: mpsc::Sender<Event>,
105107
pub mgs_updates: watch::Sender<PendingMgsUpdates>,
108+
pub saga_quiesce: SagaQuiesceHandle,
106109
}
107110

108111
impl<'a> From<RequiredRealizeArgs<'a>> for RealizeArgs<'a> {
@@ -117,6 +120,7 @@ impl<'a> From<RequiredRealizeArgs<'a>> for RealizeArgs<'a> {
117120
sender: value.sender,
118121
overrides: None,
119122
mgs_updates: value.mgs_updates,
123+
saga_quiesce: value.saga_quiesce,
120124
}
121125
}
122126
}
@@ -162,6 +166,7 @@ pub async fn realize_blueprint(
162166
sender,
163167
overrides,
164168
mgs_updates,
169+
saga_quiesce,
165170
} = exec_ctx;
166171

167172
let opctx = opctx.child(BTreeMap::from([(
@@ -262,6 +267,7 @@ pub async fn realize_blueprint(
262267
datastore,
263268
blueprint,
264269
nexus_id,
270+
saga_quiesce,
265271
);
266272

267273
register_cockroachdb_settings_step(
@@ -593,6 +599,7 @@ fn register_reassign_sagas_step<'a>(
593599
datastore: &'a DataStore,
594600
blueprint: &'a Blueprint,
595601
nexus_id: Option<OmicronZoneUuid>,
602+
saga_quiesce: SagaQuiesceHandle,
596603
) -> StepHandle<bool> {
597604
registrar
598605
.new_step(
@@ -604,22 +611,49 @@ fn register_reassign_sagas_step<'a>(
604611
.into();
605612
};
606613

607-
// For any expunged Nexus zones, re-assign in-progress sagas to
608-
// some other Nexus. If this fails for some reason, it doesn't
609-
// affect anything else.
610-
let sec_id = nexus_db_model::SecId::from(nexus_id);
611-
let reassigned = sagas::reassign_sagas_from_expunged(
612-
opctx, datastore, blueprint, sec_id,
613-
)
614-
.await
615-
.context("failed to re-assign sagas");
616-
match reassigned {
617-
Ok(needs_saga_recovery) => {
618-
Ok(StepSuccess::new(needs_saga_recovery).build())
619-
}
620-
Err(error) => {
621-
Ok(StepWarning::new(false, error.to_string()).build())
614+
// Re-assign sagas, but only if we're allowed to. If Nexus is
615+
// quiescing, we don't want to assign any new sagas to
616+
// ourselves.
617+
let result = saga_quiesce.reassign_if_possible(async || {
618+
// For any expunged Nexus zones, re-assign in-progress sagas
619+
// to some other Nexus. If this fails for some reason, it
620+
// doesn't affect anything else.
621+
let sec_id = nexus_db_model::SecId::from(nexus_id);
622+
let reassigned = sagas::reassign_sagas_from_expunged(
623+
opctx, datastore, blueprint, sec_id,
624+
)
625+
.await
626+
.context("failed to re-assign sagas");
627+
match reassigned {
628+
Ok(needs_saga_recovery) => (
629+
StepSuccess::new(needs_saga_recovery).build(),
630+
needs_saga_recovery,
631+
),
632+
Err(error) => {
633+
// It's possible that we failed after having
634+
// re-assigned sagas in the database.
635+
let maybe_reassigned = true;
636+
(
637+
StepWarning::new(false, error.to_string())
638+
.build(),
639+
maybe_reassigned,
640+
)
641+
}
622642
}
643+
});
644+
645+
match result.await {
646+
// Re-assignment is allowed, and we did try. It may or may
647+
// not have succeeded. Either way, that's reflected in
648+
// `step_result`.
649+
Ok(step_result) => Ok(step_result),
650+
// Re-assignment is disallowed. Report this step skipped
651+
// with an explanation of why.
652+
Err(error) => StepSkipped::new(
653+
false,
654+
InlineErrorChain::new(&error).to_string(),
655+
)
656+
.into(),
623657
}
624658
},
625659
)

nexus/reconfigurator/execution/src/test_utils.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ use std::net::Ipv6Addr;
88

99
use internal_dns_resolver::Resolver;
1010
use nexus_db_queries::{context::OpContext, db::DataStore};
11-
use nexus_types::deployment::{
12-
Blueprint, PendingMgsUpdates,
13-
execution::{EventBuffer, Overridables},
11+
use nexus_types::{
12+
deployment::{
13+
Blueprint, PendingMgsUpdates,
14+
execution::{EventBuffer, Overridables},
15+
},
16+
quiesce::SagaQuiesceHandle,
1417
};
1518
use omicron_uuid_kinds::OmicronZoneUuid;
1619
use update_engine::TerminalKind;
@@ -37,6 +40,8 @@ pub(crate) async fn realize_blueprint_and_expect(
3740

3841
// This helper function does not support MGS-managed updates.
3942
let (mgs_updates, _rx) = watch::channel(PendingMgsUpdates::new());
43+
// This helper function does not mess with quiescing.
44+
let saga_quiesce = SagaQuiesceHandle::new(opctx.log.clone());
4045
let nexus_id = OmicronZoneUuid::new_v4();
4146
let output = crate::realize_blueprint(
4247
RequiredRealizeArgs {
@@ -47,6 +52,7 @@ pub(crate) async fn realize_blueprint_and_expect(
4752
blueprint,
4853
sender,
4954
mgs_updates,
55+
saga_quiesce,
5056
}
5157
.with_overrides(overrides)
5258
.as_nexus(OmicronZoneUuid::new_v4()),

nexus/src/app/background/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ impl BackgroundTasksInitializer {
438438
nexus_id,
439439
task_saga_recovery.clone(),
440440
args.mgs_updates_tx,
441+
args.saga_recovery.quiesce.clone(),
441442
);
442443
let rx_blueprint_exec = blueprint_executor.watcher();
443444
driver.register(TaskDefinition {

nexus/src/app/background/tasks/blueprint_execution.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@ use nexus_db_queries::db::DataStore;
1313
use nexus_reconfigurator_execution::{
1414
RealizeBlueprintOutput, RequiredRealizeArgs,
1515
};
16-
use nexus_types::deployment::{
17-
Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer,
16+
use nexus_types::{
17+
deployment::{
18+
Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer,
19+
},
20+
quiesce::SagaQuiesceHandle,
1821
};
1922
use omicron_uuid_kinds::OmicronZoneUuid;
2023
use serde_json::json;
@@ -32,6 +35,7 @@ pub struct BlueprintExecutor {
3235
tx: watch::Sender<usize>,
3336
saga_recovery: Activator,
3437
mgs_update_tx: watch::Sender<PendingMgsUpdates>,
38+
saga_quiesce: SagaQuiesceHandle,
3539
}
3640

3741
impl BlueprintExecutor {
@@ -44,6 +48,7 @@ impl BlueprintExecutor {
4448
nexus_id: OmicronZoneUuid,
4549
saga_recovery: Activator,
4650
mgs_update_tx: watch::Sender<PendingMgsUpdates>,
51+
saga_quiesce: SagaQuiesceHandle,
4752
) -> BlueprintExecutor {
4853
let (tx, _) = watch::channel(0);
4954
BlueprintExecutor {
@@ -54,6 +59,7 @@ impl BlueprintExecutor {
5459
tx,
5560
saga_recovery,
5661
mgs_update_tx,
62+
saga_quiesce,
5763
}
5864
}
5965

@@ -113,6 +119,7 @@ impl BlueprintExecutor {
113119
blueprint,
114120
sender,
115121
mgs_updates: self.mgs_update_tx.clone(),
122+
saga_quiesce: self.saga_quiesce.clone(),
116123
}
117124
.as_nexus(self.nexus_id),
118125
)
@@ -200,6 +207,7 @@ mod test {
200207
blueprint_zone_type,
201208
};
202209
use nexus_types::external_api::views::SledState;
210+
use nexus_types::quiesce::SagaQuiesceHandle;
203211
use omicron_common::api::external;
204212
use omicron_common::api::external::Generation;
205213
use omicron_common::zpool_name::ZpoolName;
@@ -380,6 +388,7 @@ mod test {
380388
OmicronZoneUuid::new_v4(),
381389
Activator::new(),
382390
dummy_tx,
391+
SagaQuiesceHandle::new(opctx.log.clone()),
383392
);
384393

385394
// Now we're ready.

nexus/src/app/background/tasks/blueprint_planner.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -272,9 +272,12 @@ mod test {
272272
use crate::app::background::tasks::inventory_collection::InventoryCollector;
273273
use nexus_inventory::now_db_precision;
274274
use nexus_test_utils_macros::nexus_test;
275-
use nexus_types::deployment::{
276-
PendingMgsUpdates, PlannerChickenSwitches,
277-
ReconfiguratorChickenSwitches,
275+
use nexus_types::{
276+
deployment::{
277+
PendingMgsUpdates, PlannerChickenSwitches,
278+
ReconfiguratorChickenSwitches,
279+
},
280+
quiesce::SagaQuiesceHandle,
278281
};
279282
use omicron_uuid_kinds::OmicronZoneUuid;
280283

@@ -411,6 +414,7 @@ mod test {
411414
OmicronZoneUuid::new_v4(),
412415
Activator::new(),
413416
dummy_tx,
417+
SagaQuiesceHandle::new(opctx.log.clone()),
414418
);
415419
let value = executor.activate(&opctx).await;
416420
let value = value.as_object().expect("response is not a JSON object");

0 commit comments

Comments
 (0)