Skip to content

Commit 32f0348

Browse files
authored
quiesce needs to keep track of blueprint ids (#8919)
1 parent 5356bd8 commit 32f0348

File tree

6 files changed: +593 −87 lines changed

6 files changed: +593 −87 lines changed

dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs

Lines changed: 74 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,10 @@ use chrono::TimeDelta;
1111
use chrono::Utc;
1212
use clap::Args;
1313
use clap::Subcommand;
14+
use nexus_client::types::PendingRecovery;
1415
use nexus_client::types::QuiesceState;
16+
use nexus_client::types::QuiesceStatus;
17+
use nexus_client::types::SagaQuiesceStatus;
1518
use std::time::Duration;
1619

1720
#[derive(Debug, Args)]
@@ -31,9 +34,9 @@ pub enum QuiesceCommands {
3134

3235
#[derive(Debug, Args)]
3336
pub struct QuiesceShowArgs {
34-
/// Show details about held database connections
37+
/// Show stack traces for held database connections
3538
#[clap(short, long, default_value_t = false)]
36-
verbose: bool,
39+
stacks: bool,
3740
}
3841

3942
pub async fn cmd_nexus_quiesce(
@@ -60,7 +63,10 @@ async fn quiesce_show(
6063
.await
6164
.context("fetching quiesce state")?
6265
.into_inner();
63-
match quiesce.state {
66+
67+
let QuiesceStatus { db_claims, sagas, state } = quiesce;
68+
69+
match state {
6470
QuiesceState::Undetermined => {
6571
println!("has not yet determined if it is quiescing");
6672
}
@@ -145,25 +151,83 @@ async fn quiesce_show(
145151
}
146152
}
147153

148-
println!("sagas running: {}", quiesce.sagas_pending.len());
149-
for saga in &quiesce.sagas_pending {
154+
let SagaQuiesceStatus {
155+
sagas_pending,
156+
drained_blueprint_id,
157+
first_recovery_complete,
158+
new_sagas_allowed,
159+
reassignment_blueprint_id,
160+
reassignment_generation,
161+
reassignment_pending,
162+
recovered_blueprint_id,
163+
recovered_reassignment_generation,
164+
recovery_pending,
165+
} = sagas;
166+
167+
println!("saga quiesce:");
168+
println!(" new sagas: {:?}", new_sagas_allowed);
169+
println!(
170+
" drained as of blueprint: {}",
171+
drained_blueprint_id
172+
.map(|s| s.to_string())
173+
.as_deref()
174+
.unwrap_or("none")
175+
);
176+
println!(
177+
" blueprint for last completed recovery pass: {}",
178+
recovered_blueprint_id
179+
.map(|s| s.to_string())
180+
.as_deref()
181+
.unwrap_or("none")
182+
);
183+
println!(
184+
" blueprint for last reassignment pass: {}",
185+
reassignment_blueprint_id
186+
.map(|s| s.to_string())
187+
.as_deref()
188+
.unwrap_or("none")
189+
);
190+
println!(
191+
" reassignment generation: {} (pass running: {})",
192+
reassignment_generation,
193+
if reassignment_pending { "yes" } else { "no" }
194+
);
195+
println!(" recovered generation: {}", recovered_reassignment_generation);
196+
println!(
197+
" recovered at least once successfully: {}",
198+
if first_recovery_complete { "yes" } else { "no" },
199+
);
200+
print!(" recovery pending: ");
201+
if let Some(PendingRecovery { generation, blueprint_id }) = recovery_pending
202+
{
203+
println!(
204+
"yes (generation {}, blueprint id {})",
205+
generation,
206+
blueprint_id.map(|s| s.to_string()).as_deref().unwrap_or("none")
207+
);
208+
} else {
209+
println!("no");
210+
}
211+
212+
println!(" sagas running: {}", sagas_pending.len());
213+
for saga in &sagas_pending {
150214
println!(
151-
" saga {} pending since {} ({})",
215+
" saga {} pending since {} ({})",
152216
saga.saga_id,
153217
humantime::format_rfc3339_millis(saga.time_pending.into()),
154218
saga.saga_name
155219
);
156220
}
157221

158-
println!("database connections held: {}", quiesce.db_claims.len());
159-
for claim in &quiesce.db_claims {
222+
println!("database connections held: {}", db_claims.len());
223+
for claim in &db_claims {
160224
println!(
161225
" claim {} held since {} ({} ago)",
162226
claim.id,
163227
claim.held_since,
164228
format_time_delta(Utc::now() - claim.held_since),
165229
);
166-
if args.verbose {
230+
if args.stacks {
167231
println!(" acquired by:");
168232
println!("{}", textwrap::indent(&claim.debug, " "));
169233
}
@@ -177,7 +241,7 @@ async fn quiesce_start(
177241
_token: DestructiveOperationToken,
178242
) -> Result<(), anyhow::Error> {
179243
client.quiesce_start().await.context("quiescing Nexus")?;
180-
quiesce_show(client, &QuiesceShowArgs { verbose: false }).await
244+
quiesce_show(client, &QuiesceShowArgs { stacks: false }).await
181245
}
182246

183247
fn format_duration_ms(duration: Duration) -> String {

nexus/reconfigurator/execution/src/lib.rs

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ use nexus_types::deployment::execution::{
2222
StepHandle, StepResult, UpdateEngine,
2323
};
2424
use nexus_types::quiesce::SagaQuiesceHandle;
25+
use nexus_types::quiesce::SagaReassignmentDone;
2526
use omicron_uuid_kinds::OmicronZoneUuid;
2627
use slog::info;
2728
use slog_error_chain::InlineErrorChain;
@@ -662,18 +663,16 @@ fn register_reassign_sagas_step<'a>(
662663
match reassigned {
663664
Ok(needs_saga_recovery) => (
664665
StepSuccess::new(needs_saga_recovery).build(),
665-
needs_saga_recovery,
666+
SagaReassignmentDone::ReassignedAllAsOf(
667+
blueprint.id,
668+
needs_saga_recovery,
669+
),
670+
),
671+
Err(error) => (
672+
StepWarning::new(false, error.to_string())
673+
.build(),
674+
SagaReassignmentDone::Indeterminate,
666675
),
667-
Err(error) => {
668-
// It's possible that we failed after having
669-
// re-assigned sagas in the database.
670-
let maybe_reassigned = true;
671-
(
672-
StepWarning::new(false, error.to_string())
673-
.build(),
674-
maybe_reassigned,
675-
)
676-
}
677676
}
678677
})
679678
.await)

nexus/src/app/quiesce.rs

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -32,9 +32,9 @@ impl super::Nexus {
3232
) -> LookupResult<QuiesceStatus> {
3333
opctx.authorize(authz::Action::Read, &authz::QUIESCE_STATE).await?;
3434
let state = self.quiesce.state();
35-
let sagas_pending = self.quiesce.sagas().sagas_pending();
35+
let sagas = self.quiesce.sagas().status();
3636
let db_claims = self.datastore().claims_held();
37-
Ok(QuiesceStatus { state, sagas_pending, db_claims })
37+
Ok(QuiesceStatus { state, sagas, db_claims })
3838
}
3939
}
4040

@@ -283,7 +283,7 @@ mod test {
283283
assert!(duration_total >= duration_draining_db);
284284
assert!(duration_total >= duration_recording_quiesce);
285285
assert!(duration_total <= (after - before).to_std().unwrap());
286-
assert!(status.sagas_pending.is_empty());
286+
assert!(status.sagas.sagas_pending.is_empty());
287287
assert!(status.db_claims.is_empty());
288288
}
289289

@@ -357,7 +357,9 @@ mod test {
357357
quiesce_status.state,
358358
QuiesceState::DrainingSagas { .. }
359359
);
360-
assert!(quiesce_status.sagas_pending.contains_key(&demo_saga.saga_id));
360+
assert!(
361+
quiesce_status.sagas.sagas_pending.contains_key(&demo_saga.saga_id)
362+
);
361363
// We should see at least one held database claim from the one we took
362364
// above.
363365
assert!(!quiesce_status.db_claims.is_empty());
@@ -421,7 +423,7 @@ mod test {
421423
if !matches!(rv.state, QuiesceState::DrainingDb { .. }) {
422424
return Err(CondCheckError::<NexusClientError>::NotYet);
423425
}
424-
assert!(rv.sagas_pending.is_empty());
426+
assert!(rv.sagas.sagas_pending.is_empty());
425427
// The database claim we took is still held.
426428
assert!(!rv.db_claims.is_empty());
427429
Ok(())

nexus/types/src/internal_api/views.rs

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ use crate::deployment::TargetReleaseDescription;
77
use crate::inventory::BaseboardId;
88
use crate::inventory::CabooseWhich;
99
use crate::inventory::Collection;
10+
use crate::quiesce::SagaQuiesceStatus;
1011
use chrono::DateTime;
1112
use chrono::SecondsFormat;
1213
use chrono::Utc;
@@ -978,12 +979,8 @@ pub struct QuiesceStatus {
978979
/// what stage of quiescing is Nexus at
979980
pub state: QuiesceState,
980981

981-
/// what sagas are currently running or known needing to be recovered
982-
///
983-
/// This should only be non-empty when state is `Running` or
984-
/// `WaitingForSagas`. Entries here prevent transitioning from
985-
/// `WaitingForSagas` to `WaitingForDb`.
986-
pub sagas_pending: IdOrdMap<PendingSagaInfo>,
982+
/// information about saga quiescing
983+
pub sagas: SagaQuiesceStatus,
987984

988985
/// what database claims are currently held (by any part of Nexus)
989986
///

0 commit comments

Comments
 (0)