Skip to content

Commit f423987

Browse files
authored
implement Nexus quiesce (sagas, db activity) for upgrade (#8740)
1 parent 0abb3f1 commit f423987

File tree

22 files changed

+1506
-42
lines changed

22 files changed

+1506
-42
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clients/nexus-client/src/lib.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
//! Interface for making API requests to the Oxide control plane at large
66
//! from within the control plane
77
8+
use iddqd::IdOrdItem;
9+
use iddqd::id_upcast;
810
use std::collections::HashMap;
11+
use uuid::Uuid;
912

1013
progenitor::generate_api!(
1114
spec = "../../openapi/nexus-internal.json",
@@ -85,6 +88,26 @@ progenitor::generate_api!(
8588
}
8689
);
8790

91+
impl IdOrdItem for types::RunningSagaInfo {
92+
type Key<'a> = Uuid;
93+
94+
fn key(&self) -> Self::Key<'_> {
95+
self.saga_id
96+
}
97+
98+
id_upcast!();
99+
}
100+
101+
impl IdOrdItem for types::HeldDbClaimInfo {
102+
type Key<'a> = u64;
103+
104+
fn key(&self) -> Self::Key<'_> {
105+
self.id
106+
}
107+
108+
id_upcast!();
109+
}
110+
88111
impl omicron_common::api::external::ClientError for types::Error {
89112
fn message(&self) -> String {
90113
self.message.clone()

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
//! omdb commands that query or update specific Nexus instances
66
77
mod chicken_switches;
8+
mod quiesce;
89
mod update_status;
910

1011
use crate::Omdb;
@@ -79,6 +80,8 @@ use omicron_uuid_kinds::ParseError;
7980
use omicron_uuid_kinds::PhysicalDiskUuid;
8081
use omicron_uuid_kinds::SledUuid;
8182
use omicron_uuid_kinds::SupportBundleUuid;
83+
use quiesce::QuiesceArgs;
84+
use quiesce::cmd_nexus_quiesce;
8285
use serde::Deserialize;
8386
use slog_error_chain::InlineErrorChain;
8487
use std::collections::BTreeMap;
@@ -138,6 +141,8 @@ enum NexusCommands {
138141
MgsUpdates,
139142
/// interact with oximeter read policy
140143
OximeterReadPolicy(OximeterReadPolicyArgs),
144+
/// view or modify the quiesce status
145+
Quiesce(QuiesceArgs),
141146
/// view sagas, create and complete demo sagas
142147
Sagas(SagasArgs),
143148
/// interact with sleds
@@ -718,6 +723,10 @@ impl NexusArgs {
718723
}
719724
},
720725

726+
NexusCommands::Quiesce(args) => {
727+
cmd_nexus_quiesce(&omdb, &client, args).await
728+
}
729+
721730
NexusCommands::Sagas(SagasArgs { command }) => {
722731
if self.nexus_internal_url.is_none() {
723732
eprintln!(
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! omdb commands for managing Nexus quiesce state
6+
7+
use crate::Omdb;
8+
use crate::check_allow_destructive::DestructiveOperationToken;
9+
use anyhow::Context;
10+
use chrono::TimeDelta;
11+
use chrono::Utc;
12+
use clap::Args;
13+
use clap::Subcommand;
14+
use nexus_client::types::QuiesceState;
15+
use std::time::Duration;
16+
17+
#[derive(Debug, Args)]
18+
pub struct QuiesceArgs {
19+
#[command(subcommand)]
20+
command: QuiesceCommands,
21+
}
22+
23+
#[derive(Debug, Subcommand)]
24+
pub enum QuiesceCommands {
25+
/// Show the current Nexus quiesce status
26+
Show(QuiesceShowArgs),
27+
28+
/// Start quiescing Nexus
29+
Start,
30+
}
31+
32+
#[derive(Debug, Args)]
33+
pub struct QuiesceShowArgs {
34+
/// Show details about held database connections
35+
#[clap(short, long, default_value_t = false)]
36+
verbose: bool,
37+
}
38+
39+
pub async fn cmd_nexus_quiesce(
40+
omdb: &Omdb,
41+
client: &nexus_client::Client,
42+
args: &QuiesceArgs,
43+
) -> Result<(), anyhow::Error> {
44+
match &args.command {
45+
QuiesceCommands::Show(args) => quiesce_show(&client, args).await,
46+
QuiesceCommands::Start => {
47+
let token = omdb.check_allow_destructive()?;
48+
quiesce_start(&client, token).await
49+
}
50+
}
51+
}
52+
53+
async fn quiesce_show(
54+
client: &nexus_client::Client,
55+
args: &QuiesceShowArgs,
56+
) -> Result<(), anyhow::Error> {
57+
let now = Utc::now();
58+
let quiesce = client
59+
.quiesce_get()
60+
.await
61+
.context("fetching quiesce state")?
62+
.into_inner();
63+
match quiesce.state {
64+
QuiesceState::Running => {
65+
println!("running normally (not quiesced, not quiescing)");
66+
}
67+
QuiesceState::WaitingForSagas { time_requested } => {
68+
println!(
69+
"quiescing since {} ({} ago)",
70+
humantime::format_rfc3339_millis(time_requested.into()),
71+
format_time_delta(now - time_requested),
72+
);
73+
println!("details: waiting for running sagas to finish");
74+
}
75+
QuiesceState::WaitingForDb {
76+
time_requested,
77+
duration_waiting_for_sagas,
78+
..
79+
} => {
80+
println!(
81+
"quiescing since {} ({} ago)",
82+
humantime::format_rfc3339_millis(time_requested.into()),
83+
format_time_delta(now - time_requested),
84+
);
85+
println!(
86+
"details: waiting for database connections to be released"
87+
);
88+
println!(
89+
" previously: waiting for sagas took {}",
90+
format_duration_ms(duration_waiting_for_sagas.into()),
91+
);
92+
}
93+
QuiesceState::Quiesced {
94+
time_quiesced,
95+
duration_waiting_for_sagas,
96+
duration_waiting_for_db,
97+
duration_total,
98+
..
99+
} => {
100+
println!(
101+
"quiesced since {} ({} ago)",
102+
humantime::format_rfc3339_millis(time_quiesced.into()),
103+
format_time_delta(now - time_quiesced),
104+
);
105+
println!(
106+
" waiting for sagas took {}",
107+
format_duration_ms(duration_waiting_for_sagas.into()),
108+
);
109+
println!(
110+
" waiting for db quiesce took {}",
111+
format_duration_ms(duration_waiting_for_db.into()),
112+
);
113+
println!(
114+
" total quiesce time: {}",
115+
format_duration_ms(duration_total.into()),
116+
);
117+
}
118+
}
119+
120+
println!("sagas running: {}", quiesce.sagas_running.len());
121+
for saga in &quiesce.sagas_running {
122+
println!(
123+
" saga {} started at {} ({})",
124+
saga.saga_id,
125+
humantime::format_rfc3339_millis(saga.time_started.into()),
126+
saga.saga_name
127+
);
128+
}
129+
130+
println!("database connections held: {}", quiesce.db_claims.len());
131+
for claim in &quiesce.db_claims {
132+
println!(
133+
" claim {} held since {} ({} ago)",
134+
claim.id,
135+
claim.held_since,
136+
format_time_delta(Utc::now() - claim.held_since),
137+
);
138+
if args.verbose {
139+
println!(" acquired by:");
140+
println!("{}", textwrap::indent(&claim.debug, " "));
141+
}
142+
}
143+
144+
Ok(())
145+
}
146+
147+
async fn quiesce_start(
148+
client: &nexus_client::Client,
149+
_token: DestructiveOperationToken,
150+
) -> Result<(), anyhow::Error> {
151+
client.quiesce_start().await.context("quiescing Nexus")?;
152+
quiesce_show(client, &QuiesceShowArgs { verbose: false }).await
153+
}
154+
155+
fn format_duration_ms(duration: Duration) -> String {
156+
// Ignore units smaller than a millisecond.
157+
let elapsed = Duration::from_millis(
158+
u64::try_from(duration.as_millis()).unwrap_or(u64::MAX),
159+
);
160+
humantime::format_duration(elapsed).to_string()
161+
}
162+
163+
fn format_time_delta(time_delta: TimeDelta) -> String {
164+
match time_delta.to_std() {
165+
Ok(d) => format_duration_ms(d),
166+
Err(_) => String::from("<time delta out of range>"),
167+
}
168+
}

dev-tools/omdb/tests/usage_errors.out

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,7 @@ Commands:
849849
clickhouse-policy interact with clickhouse policy
850850
mgs-updates print information about pending MGS updates
851851
oximeter-read-policy interact with oximeter read policy
852+
quiesce view or modify the quiesce status
852853
sagas view sagas, create and complete demo sagas
853854
sleds interact with sleds
854855
support-bundles interact with support bundles [aliases: sb]

nexus/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ http.workspace = true
4545
http-body-util.workspace = true
4646
hyper.workspace = true
4747
hyper-staticfile.workspace = true
48+
iddqd.workspace = true
4849
id-map.workspace = true
4950
illumos-utils.workspace = true
5051
internal-dns-resolver.workspace = true

nexus/auth/src/authz/api_resources.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,50 @@ impl ApiResourceWithRolesType for Fleet {
239239
type AllowedRoles = FleetRole;
240240
}
241241

242+
/// Represents the "quiesce" state of Nexus
243+
///
244+
/// It is essential that authorizing actions on this resource *not* access the
245+
/// database because we cannot do that while quiesced and we *do* want to be
246+
/// able to read and modify the quiesce state while quiesced.
247+
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PolarClass)]
248+
pub struct QuiesceState;
249+
/// Singleton representing the [`QuiesceState`] itself for authz purposes
250+
pub const QUIESCE_STATE: QuiesceState = QuiesceState;
251+
252+
impl Eq for QuiesceState {}
253+
impl PartialEq for QuiesceState {
254+
fn eq(&self, _: &Self) -> bool {
255+
// There is only one QuiesceState
256+
true
257+
}
258+
}
259+
260+
impl AuthorizedResource for QuiesceState {
261+
fn load_roles<'fut>(
262+
&'fut self,
263+
_: &'fut OpContext,
264+
_: &'fut authn::Context,
265+
_: &'fut mut RoleSet,
266+
) -> BoxFuture<'fut, Result<(), Error>> {
267+
// We don't use (database) roles to grant access to the quiesce state.
268+
futures::future::ready(Ok(())).boxed()
269+
}
270+
271+
fn on_unauthorized(
272+
&self,
273+
_: &Authz,
274+
error: Error,
275+
_: AnyActor,
276+
_: Action,
277+
) -> Error {
278+
error
279+
}
280+
281+
fn polar_class(&self) -> oso::Class {
282+
Self::get_polar_class()
283+
}
284+
}
285+
242286
// TODO: refactor synthetic resources below
243287

244288
#[derive(Clone, Copy, Debug, PartialEq, Eq)]

nexus/auth/src/authz/omicron.polar

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,20 @@ has_relation(fleet: Fleet, "parent_fleet", collection: SamlIdentityProvider)
347347
# Fleet. None of these resources defines their own roles.
348348
#
349349

350+
# Describes the quiesce state of a particular Nexus instance.
#
# These authz checks must not require the database.  Both permissions are
# granted directly to callers of the internal API.
resource QuiesceState {
    permissions = [ "read", "modify" ];
}
has_permission(
    USER_INTERNAL_API: AuthenticatedActor,
    "read",
    _q: QuiesceState
);
has_permission(
    USER_INTERNAL_API: AuthenticatedActor,
    "modify",
    _q: QuiesceState
);
363+
350364
# Describes the policy for reading and modifying DNS configuration
351365
# (both internal and external)
352366
resource DnsConfig {

nexus/auth/src/authz/oso_generic.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ pub fn make_omicron_oso(log: &slog::Logger) -> Result<OsoInit, anyhow::Error> {
112112
IpPoolList::get_polar_class(),
113113
ConsoleSessionList::get_polar_class(),
114114
DeviceAuthRequestList::get_polar_class(),
115+
QuiesceState::get_polar_class(),
115116
SiloCertificateList::get_polar_class(),
116117
SiloIdentityProviderList::get_polar_class(),
117118
SiloUserList::get_polar_class(),

nexus/db-lookup/src/datastore_interface.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use diesel::PgConnection;
88
use diesel_dtrace::DTraceConnection;
99
use nexus_auth::context::OpContext;
1010
use omicron_common::api::external::Error;
11+
use std::any::Any;
1112

1213
/// The interface between lookups and the Nexus datastore.
1314
#[async_trait::async_trait]
@@ -48,5 +49,40 @@ where
4849
// If a more natural location becomes available in the future, consider moving
4950
// these aliases there.
5051
pub type DbConnection = DTraceConnection<PgConnection>;
51-
pub type DataStoreConnection =
52-
qorb::claim::Handle<async_bb8_diesel::Connection<DbConnection>>;
52+
pub type AsyncConnection = async_bb8_diesel::Connection<DbConnection>;
53+
54+
pub struct DataStoreConnection {
55+
inner: qorb::claim::Handle<AsyncConnection>,
56+
57+
// `DataStoreConnection` is used by various packages that we'd like to not
58+
// depend on `nexus-db-queries` (in order to parallelize compilation).
59+
// However, we need to do some datastore-specific work around the lifecycle
60+
// of this object (i.e., when it gets instantiated and when it gets
61+
// dropped). To achieve this, the caller in `nexus-db-queries` provides a
62+
// `releaser` whose sole purpose is to be dropped when this object is
63+
// dropped, allowing it to do the needed cleanup there.
64+
#[allow(dead_code)]
65+
releaser: Box<dyn Any + Send + Sync + 'static>,
66+
}
67+
68+
impl DataStoreConnection {
69+
pub fn new(
70+
inner: qorb::claim::Handle<AsyncConnection>,
71+
releaser: Box<dyn Any + Send + Sync + 'static>,
72+
) -> DataStoreConnection {
73+
DataStoreConnection { inner, releaser }
74+
}
75+
}
76+
77+
impl std::ops::Deref for DataStoreConnection {
78+
type Target = AsyncConnection;
79+
fn deref(&self) -> &Self::Target {
80+
self.inner.deref()
81+
}
82+
}
83+
84+
impl std::ops::DerefMut for DataStoreConnection {
85+
fn deref_mut(&mut self) -> &mut Self::Target {
86+
self.inner.deref_mut()
87+
}
88+
}

0 commit comments

Comments
 (0)