Skip to content

Commit e7e246b

Browse files
committed
update quiesce states to reflect RFD 588
1 parent aba8f4d commit e7e246b

File tree

9 files changed

+522
-298
lines changed

9 files changed

+522
-298
lines changed

nexus/reconfigurator/execution/src/lib.rs

Lines changed: 29 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -611,50 +611,36 @@ fn register_reassign_sagas_step<'a>(
611611
.into();
612612
};
613613

614-
// Re-assign sagas, but only if we're allowed to. If Nexus is
615-
// quiescing, we don't want to assign any new sagas to
616-
// ourselves.
617-
let result = saga_quiesce.reassign_if_possible(async || {
618-
// For any expunged Nexus zones, re-assign in-progress sagas
619-
// to some other Nexus. If this fails for some reason, it
620-
// doesn't affect anything else.
621-
let sec_id = nexus_db_model::SecId::from(nexus_id);
622-
let reassigned = sagas::reassign_sagas_from_expunged(
623-
opctx, datastore, blueprint, sec_id,
624-
)
625-
.await
626-
.context("failed to re-assign sagas");
627-
match reassigned {
628-
Ok(needs_saga_recovery) => (
629-
StepSuccess::new(needs_saga_recovery).build(),
630-
needs_saga_recovery,
631-
),
632-
Err(error) => {
633-
// It's possible that we failed after having
634-
// re-assigned sagas in the database.
635-
let maybe_reassigned = true;
636-
(
637-
StepWarning::new(false, error.to_string())
638-
.build(),
639-
maybe_reassigned,
640-
)
614+
// Re-assign sagas.
615+
Ok(saga_quiesce
616+
.reassign_sagas(async || {
617+
// For any expunged Nexus zones, re-assign in-progress
618+
// sagas to some other Nexus. If this fails for some
619+
// reason, it doesn't affect anything else.
620+
let sec_id = nexus_db_model::SecId::from(nexus_id);
621+
let reassigned = sagas::reassign_sagas_from_expunged(
622+
opctx, datastore, blueprint, sec_id,
623+
)
624+
.await
625+
.context("failed to re-assign sagas");
626+
match reassigned {
627+
Ok(needs_saga_recovery) => (
628+
StepSuccess::new(needs_saga_recovery).build(),
629+
needs_saga_recovery,
630+
),
631+
Err(error) => {
632+
// It's possible that we failed after having
633+
// re-assigned sagas in the database.
634+
let maybe_reassigned = true;
635+
(
636+
StepWarning::new(false, error.to_string())
637+
.build(),
638+
maybe_reassigned,
639+
)
640+
}
641641
}
642-
}
643-
});
644-
645-
match result.await {
646-
// Re-assignment is allowed, and we did try. It may or may
647-
// not have succeeded. Either way, that's reflected in
648-
// `step_result`.
649-
Ok(step_result) => Ok(step_result),
650-
// Re-assignment is disallowed. Report this step skipped
651-
// with an explanation of why.
652-
Err(error) => StepSkipped::new(
653-
false,
654-
InlineErrorChain::new(&error).to_string(),
655-
)
656-
.into(),
657-
}
642+
})
643+
.await)
658644
},
659645
)
660646
.register()

nexus/src/app/background/init.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ use super::tasks::vpc_routes;
131131
use super::tasks::webhook_deliverator;
132132
use crate::Nexus;
133133
use crate::app::oximeter::PRODUCER_LEASE_DURATION;
134+
use crate::app::quiesce::NexusQuiesceHandle;
134135
use crate::app::saga::StartSaga;
135136
use nexus_background_task_interface::Activator;
136137
use nexus_background_task_interface::BackgroundTasks;
@@ -437,7 +438,7 @@ impl BackgroundTasksInitializer {
437438
nexus_id,
438439
task_saga_recovery.clone(),
439440
args.mgs_updates_tx,
440-
args.saga_recovery.quiesce.clone(),
441+
args.nexus_quiesce,
441442
);
442443
let rx_blueprint_exec = blueprint_executor.watcher();
443444
driver.register(TaskDefinition {
@@ -1029,6 +1030,8 @@ pub struct BackgroundTasksData {
10291030
pub webhook_delivery_client: reqwest::Client,
10301031
/// Channel for configuring pending MGS updates
10311032
pub mgs_updates_tx: watch::Sender<PendingMgsUpdates>,
1033+
/// handle for controlling Nexus quiesce
1034+
pub nexus_quiesce: NexusQuiesceHandle,
10321035
}
10331036

10341037
/// Starts the three DNS-propagation-related background tasks for either

nexus/src/app/background/tasks/blueprint_execution.rs

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
//! Background task for realizing a plan blueprint
66
7-
use crate::app::background::{Activator, BackgroundTask};
7+
use crate::app::{
8+
background::{Activator, BackgroundTask},
9+
quiesce::NexusQuiesceHandle,
10+
};
811
use futures::FutureExt;
912
use futures::future::BoxFuture;
1013
use internal_dns_resolver::Resolver;
@@ -13,14 +16,12 @@ use nexus_db_queries::db::DataStore;
1316
use nexus_reconfigurator_execution::{
1417
RealizeBlueprintOutput, RequiredRealizeArgs,
1518
};
16-
use nexus_types::{
17-
deployment::{
18-
Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer,
19-
},
20-
quiesce::SagaQuiesceHandle,
19+
use nexus_types::deployment::{
20+
Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer,
2121
};
2222
use omicron_uuid_kinds::OmicronZoneUuid;
2323
use serde_json::json;
24+
use slog_error_chain::InlineErrorChain;
2425
use std::sync::Arc;
2526
use tokio::sync::watch;
2627
use update_engine::NestedError;
@@ -35,7 +36,7 @@ pub struct BlueprintExecutor {
3536
tx: watch::Sender<usize>,
3637
saga_recovery: Activator,
3738
mgs_update_tx: watch::Sender<PendingMgsUpdates>,
38-
saga_quiesce: SagaQuiesceHandle,
39+
nexus_quiesce: NexusQuiesceHandle,
3940
}
4041

4142
impl BlueprintExecutor {
@@ -48,7 +49,7 @@ impl BlueprintExecutor {
4849
nexus_id: OmicronZoneUuid,
4950
saga_recovery: Activator,
5051
mgs_update_tx: watch::Sender<PendingMgsUpdates>,
51-
saga_quiesce: SagaQuiesceHandle,
52+
nexus_quiesce: NexusQuiesceHandle,
5253
) -> BlueprintExecutor {
5354
let (tx, _) = watch::channel(0);
5455
BlueprintExecutor {
@@ -59,7 +60,7 @@ impl BlueprintExecutor {
5960
tx,
6061
saga_recovery,
6162
mgs_update_tx,
62-
saga_quiesce,
63+
nexus_quiesce,
6364
}
6465
}
6566

@@ -87,6 +88,47 @@ impl BlueprintExecutor {
8788
};
8889

8990
let (bp_target, blueprint) = &*update;
91+
92+
// Regardless of anything else: propagate whatever this blueprint
93+
// says about our quiescing state.
94+
//
95+
// During startup under normal operation, the blueprint will reflect
96+
// that we're not quiescing. Propagating this will enable sagas to
97+
// be created elsewhere in Nexus.
98+
//
99+
// At some point during an upgrade, we'll encounter a blueprint that
100+
// reflects that we are quiescing. Propagating this will disable sagas
101+
// from being created.
102+
//
103+
// In all other cases, this will have no effect.
104+
//
105+
// We do this now, before doing anything else, for two reasons: (1)
106+
// during startup, we want to do this ASAP to minimize unnecessary saga
107+
// creation failures (i.e., don't wait until we try to execute the
108+
// blueprint before enabling sagas, since we already know if we're
109+
// quiescing or not); and (2) because we want to do it even if blueprint
110+
// execution is disabled.
111+
match blueprint.nexus_quiescing(self.nexus_id) {
112+
Ok(quiescing) => {
113+
debug!(
114+
&opctx.log,
115+
"blueprint execution: quiesce check";
116+
"quiescing" => quiescing
117+
);
118+
self.nexus_quiesce.set_quiescing(quiescing);
119+
}
120+
Err(error) => {
121+
// This should be impossible. But it doesn't really affect
122+
// anything else so there's no reason to stop execution.
123+
error!(
124+
&opctx.log,
125+
"blueprint execution: failed to determine if this Nexus \
126+
is quiescing";
127+
InlineErrorChain::new(&*error)
128+
);
129+
}
130+
};
131+
90132
if !bp_target.enabled {
91133
warn!(&opctx.log,
92134
"Blueprint execution: skipped";
@@ -119,7 +161,7 @@ impl BlueprintExecutor {
119161
blueprint,
120162
sender,
121163
mgs_updates: self.mgs_update_tx.clone(),
122-
saga_quiesce: self.saga_quiesce.clone(),
164+
saga_quiesce: self.nexus_quiesce.sagas(),
123165
}
124166
.as_nexus(self.nexus_id),
125167
)
@@ -181,6 +223,7 @@ impl BackgroundTask for BlueprintExecutor {
181223
mod test {
182224
use super::BlueprintExecutor;
183225
use crate::app::background::{Activator, BackgroundTask};
226+
use crate::app::quiesce::NexusQuiesceHandle;
184227
use httptest::Expectation;
185228
use httptest::matchers::{not, request};
186229
use httptest::responders::status_code;
@@ -207,7 +250,6 @@ mod test {
207250
PlanningReport, blueprint_zone_type,
208251
};
209252
use nexus_types::external_api::views::SledState;
210-
use nexus_types::quiesce::SagaQuiesceHandle;
211253
use omicron_common::api::external;
212254
use omicron_common::api::external::Generation;
213255
use omicron_common::zpool_name::ZpoolName;
@@ -390,7 +432,7 @@ mod test {
390432
OmicronZoneUuid::new_v4(),
391433
Activator::new(),
392434
dummy_tx,
393-
SagaQuiesceHandle::new(opctx.log.clone()),
435+
NexusQuiesceHandle::new(&opctx.log, datastore.clone()),
394436
);
395437

396438
// Now we're ready.

nexus/src/app/background/tasks/blueprint_planner.rs

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,18 +278,15 @@ impl BackgroundTask for BlueprintPlanner {
278278
#[cfg(test)]
279279
mod test {
280280
use super::*;
281-
use crate::app::background::Activator;
282281
use crate::app::background::tasks::blueprint_execution::BlueprintExecutor;
283282
use crate::app::background::tasks::blueprint_load::TargetBlueprintLoader;
284283
use crate::app::background::tasks::inventory_collection::InventoryCollector;
284+
use crate::app::{background::Activator, quiesce::NexusQuiesceHandle};
285285
use nexus_inventory::now_db_precision;
286286
use nexus_test_utils_macros::nexus_test;
287-
use nexus_types::{
288-
deployment::{
289-
PendingMgsUpdates, PlannerChickenSwitches,
290-
ReconfiguratorChickenSwitches,
291-
},
292-
quiesce::SagaQuiesceHandle,
287+
use nexus_types::deployment::{
288+
PendingMgsUpdates, PlannerChickenSwitches,
289+
ReconfiguratorChickenSwitches,
293290
};
294291
use omicron_uuid_kinds::OmicronZoneUuid;
295292

@@ -429,7 +426,7 @@ mod test {
429426
OmicronZoneUuid::new_v4(),
430427
Activator::new(),
431428
dummy_tx,
432-
SagaQuiesceHandle::new(opctx.log.clone()),
429+
NexusQuiesceHandle::new(&opctx.log, datastore.clone()),
433430
);
434431
let value = executor.activate(&opctx).await;
435432
let value = value.as_object().expect("response is not a JSON object");

nexus/src/app/mod.rs

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ use nexus_db_queries::db;
2727
use nexus_mgs_updates::ArtifactCache;
2828
use nexus_mgs_updates::MgsUpdateDriver;
2929
use nexus_types::deployment::PendingMgsUpdates;
30-
use nexus_types::quiesce::SagaQuiesceHandle;
3130
use omicron_common::address::DENDRITE_PORT;
3231
use omicron_common::address::MGD_PORT;
3332
use omicron_common::address::MGS_PORT;
@@ -111,11 +110,11 @@ pub(crate) mod sagas;
111110
// TODO: When referring to API types, we should try to include
112111
// the prefix unless it is unambiguous.
113112

113+
use crate::app::quiesce::NexusQuiesceHandle;
114114
pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE;
115115
pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE;
116116
use nexus_mgs_updates::DEFAULT_RETRY_TIMEOUT;
117117
use nexus_types::internal_api::views::MgsUpdateDriverStatus;
118-
use nexus_types::internal_api::views::QuiesceState;
119118
use sagas::demo::CompletingDemoSagas;
120119

121120
// XXX: Might want to recast as max *floating* IPs, we have at most one
@@ -280,11 +279,8 @@ pub struct Nexus {
280279
#[allow(dead_code)]
281280
repo_depot_resolver: Box<dyn qorb::resolver::Resolver>,
282281

283-
/// whether Nexus is quiescing, and how far it's gotten
284-
quiesce: watch::Sender<QuiesceState>,
285-
286-
/// details about saga quiescing
287-
saga_quiesce: SagaQuiesceHandle,
282+
/// state of overall Nexus quiesce activity
283+
quiesce: NexusQuiesceHandle,
288284
}
289285

290286
impl Nexus {
@@ -336,6 +332,8 @@ impl Nexus {
336332
sec_store,
337333
));
338334

335+
let quiesce = NexusQuiesceHandle::new(&log, db_datastore.clone());
336+
339337
// It's a bit of a red flag to use an unbounded channel.
340338
//
341339
// This particular channel is used to send a Uuid from the saga executor
@@ -360,14 +358,11 @@ impl Nexus {
360358
// task. If someone changed the config, they'd have to remember to
361359
// update this here. This doesn't seem worth it.
362360
let (saga_create_tx, saga_recovery_rx) = mpsc::unbounded_channel();
363-
let saga_quiesce = SagaQuiesceHandle::new(
364-
log.new(o!("component" => "SagaQuiesceHandle")),
365-
);
366361
let sagas = Arc::new(SagaExecutor::new(
367362
Arc::clone(&sec_client),
368363
log.new(o!("component" => "SagaExecutor")),
369364
saga_create_tx,
370-
saga_quiesce.clone(),
365+
quiesce.sagas(),
371366
));
372367

373368
// Create a channel for replicating repository artifacts. 16 is a
@@ -465,8 +460,6 @@ impl Nexus {
465460
let mgs_update_status_rx = mgs_update_driver.status_rx();
466461
let _mgs_driver_task = tokio::spawn(mgs_update_driver.run());
467462

468-
let (quiesce, _) = watch::channel(QuiesceState::running());
469-
470463
let nexus = Nexus {
471464
id: config.deployment.id,
472465
rack_id,
@@ -520,7 +513,6 @@ impl Nexus {
520513
mgs_resolver,
521514
repo_depot_resolver,
522515
quiesce,
523-
saga_quiesce,
524516
};
525517

526518
// TODO-cleanup all the extra Arcs here seems wrong
@@ -570,14 +562,15 @@ impl Nexus {
570562
webhook_delivery_client: task_nexus
571563
.webhook_delivery_client
572564
.clone(),
565+
nexus_quiesce: task_nexus.quiesce.clone(),
573566

574567
saga_recovery: SagaRecoveryHelpers {
575568
recovery_opctx: saga_recovery_opctx,
576569
maker: task_nexus.clone(),
577570
sec_client: sec_client.clone(),
578571
registry: sagas::ACTION_REGISTRY.clone(),
579572
sagas_started_rx: saga_recovery_rx,
580-
quiesce: task_nexus.saga_quiesce.clone(),
573+
quiesce: task_nexus.quiesce.sagas(),
581574
},
582575
tuf_artifact_replication_rx,
583576
mgs_updates_tx,

0 commit comments

Comments
 (0)