Skip to content

Commit 0c0bdf4

Browse files
authored
Blueprint execution: Split up top-level cleanup implementation (#9512)
Before this change, the high-level structure of this was "for each ready-for-cleanup zone in the blueprint, match on its type and then maybe perform some cleanup action depending on the type". After this change, this is inverted: "for each zone type we care about, for each ready-for-cleanup zone of that type in the blueprint, perform the relevant cleanup action". On its face this is a little worse: we now loop over the zones in the blueprint twice (currently, because we have two zone types we care about for cleanup, although a third is implemented and compiled out for now) instead of once. The intent here is that we're going to attach a `reason` for why a given caller wants to act on expunged zones, and it's much clearer if it's broken out this way. For example, in future work we'll change this: ```rust let zones_to_clean_up = blueprint .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); ``` to something like ```rust let zones_to_clean_up = blueprint.expunged_oximeter_zones_ready_for_cleanup( Reason::OximeterMetricProducerReassignment, ); ``` and it's harder to express some kind of "composite reason" if we were to keep the original structure.
1 parent 7cedcdb commit 0c0bdf4

File tree

2 files changed

+124
-107
lines changed

2 files changed

+124
-107
lines changed

nexus/reconfigurator/execution/src/omicron_zones.rs

Lines changed: 119 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,22 @@ use anyhow::Context;
88
use anyhow::bail;
99
use cockroach_admin_client::types::NodeDecommission;
1010
use cockroach_admin_client::types::NodeId;
11+
use futures::Stream;
1112
use futures::StreamExt;
13+
use futures::future::Either;
1214
use futures::stream;
1315
use internal_dns_resolver::Resolver;
1416
use internal_dns_types::names::ServiceName;
1517
use nexus_db_queries::context::OpContext;
1618
use nexus_db_queries::db::DataStore;
1719
use nexus_db_queries::db::datastore::CollectorReassignment;
1820
use nexus_types::deployment::Blueprint;
19-
use nexus_types::deployment::BlueprintZoneConfig;
2021
use nexus_types::deployment::BlueprintZoneDisposition;
21-
use nexus_types::deployment::BlueprintZoneType;
2222
use omicron_common::address::COCKROACH_ADMIN_PORT;
2323
use omicron_uuid_kinds::GenericUuid;
2424
use omicron_uuid_kinds::OmicronZoneUuid;
2525
use omicron_uuid_kinds::SledUuid;
26+
use sled_agent_types::inventory::ZoneKind;
2627
use slog::Logger;
2728
use slog::info;
2829
use slog::warn;
@@ -45,96 +46,116 @@ pub(crate) async fn clean_up_expunged_zones<R: CleanupResolver>(
4546
// cockroach-admin knows how to gate decommissioning correctly.
4647
let decommission_cockroach = false;
4748

48-
clean_up_expunged_zones_impl(
49-
opctx,
50-
datastore,
51-
resolver,
52-
decommission_cockroach,
53-
blueprint
54-
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup),
55-
)
56-
.await
57-
}
49+
let cockroach_cleanups = if decommission_cockroach {
50+
Either::Left(clean_up_expunged_cockroach_zones(
51+
opctx, datastore, resolver, blueprint,
52+
))
53+
} else {
54+
Either::Right(stream::empty())
55+
};
5856

59-
async fn clean_up_expunged_zones_impl<R: CleanupResolver>(
60-
opctx: &OpContext,
61-
datastore: &DataStore,
62-
resolver: &R,
63-
decommission_cockroach: bool,
64-
zones_to_clean_up: impl Iterator<Item = (SledUuid, &BlueprintZoneConfig)>,
65-
) -> Result<(), Vec<anyhow::Error>> {
66-
let errors: Vec<anyhow::Error> = stream::iter(zones_to_clean_up)
67-
.filter_map(async |(sled_id, config)| {
68-
let log = opctx.log.new(slog::o!(
69-
"sled_id" => sled_id.to_string(),
70-
"zone_id" => config.id.to_string(),
71-
"zone_type" => config.zone_type.kind().report_str(),
72-
));
73-
74-
let result = match &config.zone_type {
75-
// Zones which need cleanup after expungement.
76-
BlueprintZoneType::Nexus(_) => Some(
77-
datastore
78-
.database_nexus_access_delete(&opctx, config.id)
79-
.await
80-
.map_err(|err| anyhow::anyhow!(err)),
81-
),
82-
BlueprintZoneType::CockroachDb(_) => {
83-
if decommission_cockroach {
84-
Some(
85-
decommission_cockroachdb_node(
86-
opctx, datastore, resolver, config.id, &log,
87-
)
88-
.await,
89-
)
90-
} else {
91-
None
92-
}
93-
}
94-
BlueprintZoneType::Oximeter(_) => Some(
95-
oximeter_cleanup(opctx, datastore, config.id, &log).await,
96-
),
97-
98-
// Zones that may or may not need cleanup work - we haven't
99-
// gotten to these yet!
100-
BlueprintZoneType::BoundaryNtp(_)
101-
| BlueprintZoneType::Clickhouse(_)
102-
| BlueprintZoneType::ClickhouseKeeper(_)
103-
| BlueprintZoneType::ClickhouseServer(_)
104-
| BlueprintZoneType::Crucible(_)
105-
| BlueprintZoneType::CruciblePantry(_)
106-
| BlueprintZoneType::ExternalDns(_)
107-
| BlueprintZoneType::InternalDns(_)
108-
| BlueprintZoneType::InternalNtp(_) => {
109-
warn!(
110-
log,
111-
"unsupported zone type for expungement cleanup; \
112-
not performing any cleanup";
113-
);
114-
None
115-
}
116-
}?;
117-
118-
match result {
119-
Err(error) => {
120-
warn!(
121-
log, "failed to clean up expunged zone";
122-
"error" => #%error,
123-
);
124-
Some(error)
125-
}
126-
Ok(()) => {
127-
info!(log, "successfully cleaned up after expunged zone");
128-
None
129-
}
57+
let errors = cockroach_cleanups
58+
.chain(clean_up_expunged_nexus_zones(opctx, datastore, blueprint))
59+
.chain(clean_up_expunged_oximeter_zones(opctx, datastore, blueprint))
60+
.filter_map(async |(log, result)| match result {
61+
Ok(()) => {
62+
info!(log, "successfully cleaned up after expunged zone");
63+
None
64+
}
65+
Err(error) => {
66+
warn!(
67+
log, "failed to clean up expunged zone";
68+
InlineErrorChain::new(&*error),
69+
);
70+
Some(error)
13071
}
13172
})
132-
.collect()
73+
.collect::<Vec<_>>()
13374
.await;
13475

13576
if errors.is_empty() { Ok(()) } else { Err(errors) }
13677
}
13778

79+
fn make_cleanup_logger(
80+
opctx: &OpContext,
81+
sled_id: SledUuid,
82+
zone_id: OmicronZoneUuid,
83+
zone_kind: ZoneKind,
84+
) -> Logger {
85+
opctx.log.new(slog::o!(
86+
"sled_id" => sled_id.to_string(),
87+
"zone_id" => zone_id.to_string(),
88+
"zone_type" => zone_kind.report_str(),
89+
))
90+
}
91+
92+
fn clean_up_expunged_nexus_zones<'a>(
93+
opctx: &'a OpContext,
94+
datastore: &'a DataStore,
95+
blueprint: &'a Blueprint,
96+
) -> impl Stream<Item = (Logger, anyhow::Result<()>)> + 'a {
97+
let zones_to_clean_up = blueprint
98+
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
99+
.filter(|(_sled_id, zone)| zone.zone_type.is_nexus());
100+
101+
stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move {
102+
let zone_kind = zone.zone_type.kind();
103+
assert_eq!(zone_kind, ZoneKind::Nexus);
104+
105+
let log = make_cleanup_logger(opctx, sled_id, zone.id, zone_kind);
106+
let result = datastore
107+
.database_nexus_access_delete(&opctx, zone.id)
108+
.await
109+
.map_err(|err| anyhow::anyhow!(err));
110+
111+
(log, result)
112+
})
113+
}
114+
115+
fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>(
116+
opctx: &'a OpContext,
117+
datastore: &'a DataStore,
118+
resolver: &'a R,
119+
blueprint: &'a Blueprint,
120+
) -> impl Stream<Item = (Logger, anyhow::Result<()>)> + 'a {
121+
let zones_to_clean_up = blueprint
122+
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
123+
.filter(|(_sled_id, zone)| zone.zone_type.is_cockroach());
124+
125+
stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move {
126+
let zone_kind = zone.zone_type.kind();
127+
assert_eq!(zone_kind, ZoneKind::CockroachDb);
128+
129+
let log = make_cleanup_logger(opctx, sled_id, zone.id, zone_kind);
130+
let result = decommission_cockroachdb_node(
131+
opctx, datastore, resolver, zone.id, &log,
132+
)
133+
.await;
134+
135+
(log, result)
136+
})
137+
}
138+
139+
fn clean_up_expunged_oximeter_zones<'a>(
140+
opctx: &'a OpContext,
141+
datastore: &'a DataStore,
142+
blueprint: &'a Blueprint,
143+
) -> impl Stream<Item = (Logger, anyhow::Result<()>)> + 'a {
144+
let zones_to_clean_up = blueprint
145+
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
146+
.filter(|(_sled_id, zone)| zone.zone_type.is_oximeter());
147+
148+
stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move {
149+
let zone_kind = zone.zone_type.kind();
150+
assert_eq!(zone_kind, ZoneKind::Oximeter);
151+
152+
let log = make_cleanup_logger(opctx, sled_id, zone.id, zone_kind);
153+
let result = oximeter_cleanup(opctx, datastore, zone.id, &log).await;
154+
155+
(log, result)
156+
})
157+
}
158+
138159
async fn oximeter_cleanup(
139160
opctx: &OpContext,
140161
datastore: &DataStore,
@@ -307,15 +328,14 @@ mod test {
307328
use httptest::responders::{json_encoded, status_code};
308329
use nexus_test_utils_macros::nexus_test;
309330
use nexus_types::deployment::{
310-
BlueprintZoneImageSource, blueprint_zone_type,
331+
BlueprintZoneConfig, BlueprintZoneImageSource, BlueprintZoneType,
332+
blueprint_zone_type,
311333
};
312334
use omicron_common::api::external::Generation;
313335
use omicron_common::zpool_name::ZpoolName;
314336
use omicron_uuid_kinds::OmicronZoneUuid;
315-
use omicron_uuid_kinds::SledUuid;
316337
use omicron_uuid_kinds::ZpoolUuid;
317338
use sled_agent_types::inventory::OmicronZoneDataset;
318-
use std::iter;
319339
use uuid::Uuid;
320340

321341
type ControlPlaneTestContext =
@@ -325,11 +345,6 @@ mod test {
325345
async fn test_clean_up_cockroach_zones(
326346
cptestctx: &ControlPlaneTestContext,
327347
) {
328-
// The whole point of this test is to check that we send decommissioning
329-
// requests; enable that. (See the real `clean_up_expunged_zones()` for
330-
// more context.)
331-
let decommission_cockroach = true;
332-
333348
// Test setup boilerplate.
334349
let nexus = &cptestctx.server.server_context().nexus;
335350
let datastore = nexus.datastore();
@@ -339,7 +354,6 @@ mod test {
339354
);
340355

341356
// Construct the cockroach zone we're going to try to clean up.
342-
let any_sled_id = SledUuid::new_v4();
343357
let crdb_zone = BlueprintZoneConfig {
344358
disposition: BlueprintZoneDisposition::Expunged {
345359
as_of_generation: Generation::new(),
@@ -379,12 +393,12 @@ mod test {
379393
// otherwise succeed, without attempting to contact our mock admin
380394
// server. (We don't have a good way to confirm the warning was logged,
381395
// so we'll just check for an Ok return and no contact to mock_admin.)
382-
clean_up_expunged_zones_impl(
396+
decommission_cockroachdb_node(
383397
&opctx,
384398
datastore,
385399
&fake_resolver,
386-
decommission_cockroach,
387-
iter::once((any_sled_id, &crdb_zone)),
400+
crdb_zone.id,
401+
&opctx.log,
388402
)
389403
.await
390404
.expect("unknown node ID: no cleanup");
@@ -426,12 +440,12 @@ mod test {
426440
);
427441
};
428442
add_decommission_expecation(&mut mock_admin);
429-
clean_up_expunged_zones_impl(
443+
decommission_cockroachdb_node(
430444
&opctx,
431445
datastore,
432446
&fake_resolver,
433-
decommission_cockroach,
434-
iter::once((any_sled_id, &crdb_zone)),
447+
crdb_zone.id,
448+
&opctx.log,
435449
)
436450
.await
437451
.expect("decommissioned test node");
@@ -458,17 +472,15 @@ mod test {
458472
add_decommission_failure_expecation(&mut mock_bad2);
459473
let mut fake_resolver =
460474
FixedResolver(vec![mock_bad1.addr(), mock_bad2.addr()]);
461-
let mut err = clean_up_expunged_zones_impl(
475+
let err = decommission_cockroachdb_node(
462476
&opctx,
463477
datastore,
464478
&fake_resolver,
465-
decommission_cockroach,
466-
iter::once((any_sled_id, &crdb_zone)),
479+
crdb_zone.id,
480+
&opctx.log,
467481
)
468482
.await
469483
.expect_err("no successful response should result in failure");
470-
assert_eq!(err.len(), 1);
471-
let err = err.pop().unwrap();
472484
assert_eq!(
473485
err.to_string(),
474486
format!(
@@ -487,12 +499,12 @@ mod test {
487499
add_decommission_failure_expecation(&mut mock_bad2);
488500
add_decommission_expecation(&mut mock_admin);
489501
fake_resolver.0.push(mock_admin.addr());
490-
clean_up_expunged_zones_impl(
502+
decommission_cockroachdb_node(
491503
&opctx,
492504
datastore,
493505
&fake_resolver,
494-
decommission_cockroach,
495-
iter::once((any_sled_id, &crdb_zone)),
506+
crdb_zone.id,
507+
&opctx.log,
496508
)
497509
.await
498510
.expect("decommissioned test node");

nexus/types/src/deployment/zone_type.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ impl BlueprintZoneType {
202202
matches!(self, BlueprintZoneType::Clickhouse(_))
203203
}
204204

205+
/// Identifies whether this is an Oximeter zone
206+
pub fn is_oximeter(&self) -> bool {
207+
matches!(self, BlueprintZoneType::Oximeter(_))
208+
}
209+
205210
/// Returns the durable dataset associated with this zone, if any exists.
206211
pub fn durable_dataset(&self) -> Option<DurableDataset<'_>> {
207212
let (dataset, kind) = match self {

0 commit comments

Comments
 (0)