diff --git a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs b/dev-tools/omdb/src/bin/omdb/db/blueprints.rs
index a4ee372bc02..8e1e0e2a125 100644
--- a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs
+++ b/dev-tools/omdb/src/bin/omdb/db/blueprints.rs
@@ -232,5 +232,13 @@ async fn cmd_db_blueprint_planner_report_show(
     println!("planner report for blueprint {blueprint_id}:");
     println!("{report}");
 
+    let operator_notes = report.operator_notes().into_notes();
+    if !operator_notes.is_empty() {
+        println!("\nnotes for customer operator:");
+        for note in operator_notes {
+            println!(" * {note}");
+        }
+    }
+
     Ok(())
 }
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index ba395aff074..a851b68683b 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -1309,6 +1309,14 @@ fn print_task_blueprint_planner(details: &serde_json::Value) {
                 but could not make it the target: {error}"
             );
             println!("{report}");
+
+            let operator_notes = report.operator_notes().into_notes();
+            if !operator_notes.is_empty() {
+                println!("\nnotes for customer operator:");
+                for note in operator_notes {
+                    println!(" * {note}");
+                }
+            }
         }
         BlueprintPlannerStatus::Targeted { blueprint_id, report, .. } => {
             println!(
diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs
index 5d03257a923..63519a849f3 100644
--- a/dev-tools/reconfigurator-cli/src/lib.rs
+++ b/dev-tools/reconfigurator-cli/src/lib.rs
@@ -2106,9 +2106,34 @@ fn cmd_blueprint_plan(
         .context("creating planner")?;
     let blueprint = planner.plan().context("generating blueprint")?;
 
+    let operator_notes = match &blueprint.source {
+        BlueprintSource::Planner(report) => {
+            let mut notes = report
+                .operator_notes()
+                .into_notes()
+                .into_iter()
+                .map(|note| format!("\n * {note}"))
+                .peekable();
+            if notes.peek().is_some() {
+                format!(
+                    "\n\nnotes for customer operator:{}",
+                    notes.collect::<String>()
+                )
+            } else {
+                String::new()
+            }
+        }
+        BlueprintSource::Rss
+        | BlueprintSource::PlannerLoadedFromDatabase
+        | BlueprintSource::ReconfiguratorCliEdit
+        | BlueprintSource::Test => unreachable!(
+            "unexpected blueprint source {} (just ran planner!)",
+            blueprint.source
+        ),
+    };
     let rv = format!(
         "generated blueprint {} based on parent blueprint {}\n\
-         blueprint source: {}",
+         blueprint source: {}{operator_notes}",
        blueprint.id, parent_blueprint.id, blueprint.source,
    );
    system.add_blueprint(blueprint)?;
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
index 67bc81ec0a2..586d762b57f 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
@@ -53,6 +53,9 @@ planning report:
 
+notes for customer operator:
+ * 1 sled with no available disks to host NTP service
+
 > blueprint-show 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
 blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
 parent: dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout
index 040173296fc..11387a7bf00 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout
@@ -95,7 +95,7 @@ INFO BootPartitionDetails inventory hash not found in TUF repo, ignoring for noo
 generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21
 blueprint source: planner with report:
 planning report:
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (2) is lower than minimum required by blueprint (3)
   - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6
@@ -109,6 +109,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > # Diff the blueprints. This diff should show "will remove mupdate override"
 > # and the target release minimum generation being set.
@@ -220,7 +223,7 @@ INFO BootPartitionDetails inventory hash not found in TUF repo, ignoring for noo
 generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
 blueprint source: planner with report:
 planning report:
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (2) is lower than minimum required by blueprint (3)
   - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6
@@ -234,6 +237,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-diff latest
 from: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
 to:   blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4
@@ -301,6 +307,9 @@ planner config:
 
+notes for customer operator:
+ * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations
+
 > blueprint-diff latest
 from: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4
 to:   blueprint af934083-59b5-4bf6-8966-6fb5292c29e1
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
index 9d953c1aeb7..d15c0d482cc 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
@@ -868,6 +868,9 @@ planner config:
 
+notes for customer operator:
+ * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations
+
 > blueprint-diff 366b0b68-d80e-4bc1-abd3-dc69837847e0 9c998c1d-1a7b-440a-ae0c-40f781dea6e2
 from: blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0
 to:   blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
index 51ac8e590b6..53fb87f34ed 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
@@ -505,7 +505,7 @@ INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-e
 generated blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 based on parent blueprint d60afc57-f15d-476c-bd0f-b1071e2bb976
 blueprint source: planner with report:
 planning report:
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (2) is lower than minimum required by blueprint (3)
   - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, d81c6a84-79b8-4958-ae41-ea46c9b19763
@@ -520,6 +520,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > # Diff the blueprints. This diff should show:
 > #
@@ -787,7 +790,7 @@ INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-e
 generated blueprint 626487fa-7139-45ec-8416-902271fc730b based on parent blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736
 blueprint source: planner with report:
 planning report:
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, d81c6a84-79b8-4958-ae41-ea46c9b19763
   - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
@@ -801,6 +804,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-diff latest
 from: blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736
 to:   blueprint 626487fa-7139-45ec-8416-902271fc730b
@@ -920,7 +926,7 @@ generated blueprint c1a0d242-9160-40f4-96ae-61f8f40a0b1b based on parent bluepri
 blueprint source: planner with report:
 planning report:
 * noop converting 6/6 install-dataset zones to artifact store on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (3) is lower than minimum required by blueprint (4)
   - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763
@@ -934,6 +940,10 @@ planning report:
 
+notes for customer operator:
+ * across 1 sled, converted 6/6 service zones to known versions
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > # Diff the blueprints. This diff should show:
 > # * on sled 0:
@@ -1110,7 +1120,7 @@ generated blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 based on parent bluepri
 blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (3) is lower than minimum required by blueprint (4)
   - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
@@ -1123,6 +1133,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-show latest
 blueprint afb09faf-a586-4483-9289-04d4f1d8ba23
 parent: c1a0d242-9160-40f4-96ae-61f8f40a0b1b
@@ -1292,7 +1305,7 @@ parent: c1a0d242-9160-40f4-96ae-61f8f40a0b1b
 blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (3) is lower than minimum required by blueprint (4)
   - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
@@ -1440,7 +1453,7 @@ blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
 * noop converting 6/6 install-dataset zones to artifact store on sled d81c6a84-79b8-4958-ae41-ea46c9b19763
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
   - sleds have deployment units with image sources not set to Artifact:
@@ -1451,6 +1464,10 @@ planning report:
 
+notes for customer operator:
+ * across 1 sled, converted 6/6 service zones to known versions
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-show latest
 blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700
 parent: afb09faf-a586-4483-9289-04d4f1d8ba23
@@ -1621,7 +1638,7 @@ blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
 * noop converting 6/6 install-dataset zones to artifact store on sled d81c6a84-79b8-4958-ae41-ea46c9b19763
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
   - sleds have deployment units with image sources not set to Artifact:
@@ -1740,7 +1757,7 @@ blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
 * skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have deployment units with image sources not set to Artifact:
     - sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c: 7 zones
@@ -1750,6 +1767,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-diff latest
 from: blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700
 to:   blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1
@@ -1818,6 +1838,10 @@ planning report:
 
+notes for customer operator:
+ * across 1 sled, converted 7/7 service zones to known versions
+ * out of eligible sleds: only placed 0 services out of 1 desired
+
 > blueprint-show latest
 blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300
 parent: 8f2d1f39-7c88-4701-aa43-56bf281b28c1
@@ -2191,6 +2215,10 @@ planning report:
 
+notes for customer operator:
+ * across 3 sleds, converted 3 OS images to known versions
+ * out of eligible sleds: only placed 0 services out of 1 desired
+
 > blueprint-diff latest
 from: blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300
 to:   blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839
@@ -2423,7 +2451,7 @@ planning report:
 * skipping noop zone image source check on sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c: all 7 zones are already from artifacts
 * skipping noop zone image source check on sled c3bc4c6d-fdde-4fc4-8493-89d2a1e5ee6b: all 0 zones are already from artifacts
 * skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - current target release generation (4) is lower than minimum required by blueprint (5)
   - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6
@@ -2435,6 +2463,9 @@ planning report:
 
+notes for customer operator:
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > blueprint-diff latest
 from: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839
 to:   blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c
@@ -2573,6 +2604,10 @@ planner config:
 
+notes for customer operator:
+ * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations
+ * out of eligible sleds: only placed 0 services out of 1 desired
+
 > blueprint-diff latest
 from: blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c
 to:   blueprint 9f89efdf-a23e-4137-b7cc-79f4a91cbe1f
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout
index 203e65f87ab..444df193911 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout
@@ -185,7 +185,7 @@ blueprint source: planner with report:
 planning report:
 * noop converting 6/6 install-dataset zones to artifact store on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6
 * noop converting 5/6 install-dataset zones to artifact store on sled aff6c093-197d-42c5-ad80-9f10ba051a34
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763
   - sleds have deployment units with image sources not set to Artifact:
@@ -200,6 +200,10 @@ planning report:
 
+notes for customer operator:
+ * across 2 sleds, converted 11/12 service zones to known versions
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > # This diff should show expected changes to the blueprint.
 > blueprint-diff latest
@@ -433,7 +437,7 @@ blueprint source: planner with report:
 planning report:
 * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts
 * noop converting 2/2 install-dataset zones to artifact store on sled e96e226f-4ed9-4c01-91b9-69a9cd076c9e
-* zone adds waiting on blockers
+* zone adds waiting on mupdate blockers
 * zone adds and updates are blocked:
   - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763
   - sleds have deployment units with image sources not set to Artifact:
@@ -447,6 +451,10 @@ planning report:
 
+notes for customer operator:
+ * across 1 sled, converted 2/2 service zones to known versions
+ * service updates are waiting for system to recover from support-driven update (mupdate)
+
 > # This diff should show changes to the sled that's back in inventory.
 > blueprint-diff latest
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
index 46533ac676f..0475de80881 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
@@ -152,6 +152,9 @@ planning report:
 
+notes for customer operator:
+ * across 3 sleds, converted 25/25 service zones to known versions
+
 > blueprint-diff latest
 from: blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21
 to:   blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index c9195170ba0..6bf6052ff72 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -259,7 +259,7 @@ impl<'a> Planner<'a> {
         {
             self.do_plan_add(&mgs_updates)?
         } else {
-            PlanningAddStepReport::waiting_on(ZoneAddWaitingOn::Blockers)
+            PlanningAddStepReport::waiting_on(ZoneAddWaitingOn::MupdateBlockers)
         };
         add.add_update_blocked_reasons = add_update_blocked_reasons;
         add.add_zones_with_mupdate_override = add_zones_with_mupdate_override;
@@ -536,7 +536,11 @@ impl<'a> Planner<'a> {
                         // have been added and then expunged since our
                         // parent blueprint was created). We don't want to
                        // fail in this case, but will report it.
-                        report.orphan_disks.insert(sled_id, disk.disk_id);
+                        report
+                            .orphan_disks
+                            .entry(sled_id)
+                            .or_default()
+                            .push(disk.disk_id);
                     }
                     Err(err) => return Err(err),
                 }
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index f435955bf3a..adac2327399 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -131,6 +131,7 @@ pub use planning_report::PlanningNoopImageSourceSkipSledHostPhase2Reason;
 pub use planning_report::PlanningNoopImageSourceSkipSledZonesReason;
 pub use planning_report::PlanningNoopImageSourceSkipZoneReason;
 pub use planning_report::PlanningNoopImageSourceStepReport;
+pub use planning_report::PlanningOperatorNotes;
 pub use planning_report::PlanningReport;
 pub use planning_report::PlanningZoneUpdatesStepReport;
 pub use planning_report::ZoneAddWaitingOn;
diff --git a/nexus/types/src/deployment/planning_report.rs b/nexus/types/src/deployment/planning_report.rs
index 6cee384b3b1..eb524c6b241 100644
--- a/nexus/types/src/deployment/planning_report.rs
+++ b/nexus/types/src/deployment/planning_report.rs
@@ -37,6 +37,10 @@ use std::fmt::Write;
 use std::sync::Arc;
 use thiserror::Error;
 
+mod operator_notes;
+
+pub use operator_notes::PlanningOperatorNotes;
+
 /// A full blueprint planning report. Other than the blueprint ID, each
 /// field corresponds to a step in the update planner, i.e., a subroutine
 /// of `omicron_nexus::reconfigurator::planning::Planner::do_plan`.
@@ -89,14 +93,33 @@ impl PlanningReport {
     }
 
     pub fn is_empty(&self) -> bool {
-        self.expunge.is_empty()
-            && self.decommission.is_empty()
-            && self.noop_image_source.is_empty()
-            && self.mgs_updates.is_empty()
-            && self.add.is_empty()
-            && self.zone_updates.is_empty()
-            && self.nexus_generation_bump.is_empty()
-            && self.cockroachdb_settings.is_empty()
+        let Self {
+            expunge,
+            decommission,
+            noop_image_source,
+            mgs_updates,
+            add,
+            zone_updates,
+            nexus_generation_bump,
+            cockroachdb_settings,
+            // the config is not relevant to whether we have content
+            planner_config: _,
+        } = self;
+
+        expunge.is_empty()
+            && decommission.is_empty()
+            && noop_image_source.is_empty()
+            && mgs_updates.is_empty()
+            && add.is_empty()
+            && zone_updates.is_empty()
+            && nexus_generation_bump.is_empty()
+            && cockroachdb_settings.is_empty()
+    }
+
+    /// Returns a set of notes condensed from this report that are suitable for
+    /// display to a rack operator (i.e., not internal debugging).
+    pub fn operator_notes(&self) -> PlanningOperatorNotes {
+        PlanningOperatorNotes::new(self)
     }
 }
@@ -139,7 +162,7 @@ impl fmt::Display for PlanningReport {
 #[cfg_attr(test, derive(test_strategy::Arbitrary))]
 pub struct PlanningExpungeStepReport {
     /// Expunged disks not present in the parent blueprint.
-    pub orphan_disks: BTreeMap<SledUuid, PhysicalDiskUuid>,
+    pub orphan_disks: BTreeMap<SledUuid, Vec<PhysicalDiskUuid>>,
 }
 
 impl PlanningExpungeStepReport {
@@ -161,8 +184,11 @@ impl fmt::Display for PlanningExpungeStepReport {
             "* planning input contained expunged disks \
              not present in parent blueprint:",
         )?;
-        for (sled, disk) in orphan_disks.iter() {
-            writeln!(f, " * sled {sled}, disk {disk}",)?;
+        for (sled, disks) in orphan_disks.iter() {
+            writeln!(f, " * sled {sled}:")?;
+            for disk in disks {
+                writeln!(f, "   * disk {disk}")?;
+            }
         }
     }
     Ok(())
@@ -675,13 +701,13 @@ pub struct DiscretionaryZonePlacement {
 pub enum ZoneAddWaitingOn {
     /// Waiting on one or more blockers (typically MUPdate-related reasons) to
    /// clear.
-    Blockers,
+    MupdateBlockers,
 }
 
 impl ZoneAddWaitingOn {
     pub fn as_str(&self) -> &'static str {
         match self {
-            Self::Blockers => "blockers",
+            Self::MupdateBlockers => "mupdate blockers",
         }
     }
 }
diff --git a/nexus/types/src/deployment/planning_report/operator_notes.rs b/nexus/types/src/deployment/planning_report/operator_notes.rs
new file mode 100644
index 00000000000..267ce873136
--- /dev/null
+++ b/nexus/types/src/deployment/planning_report/operator_notes.rs
@@ -0,0 +1,462 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Summarizing a [`PlanningReport`] for customer operators.
+
+use super::ZoneAddWaitingOn;
+use super::ZoneUnsafeToShutdown;
+use crate::deployment::PlannerConfig;
+use crate::deployment::PlanningAddStepReport;
+use crate::deployment::PlanningDecommissionStepReport;
+use crate::deployment::PlanningExpungeStepReport;
+use crate::deployment::PlanningMgsUpdatesStepReport;
+use crate::deployment::PlanningNoopImageSourceSkipSledHostPhase2Reason;
+use crate::deployment::PlanningNoopImageSourceSkipSledZonesReason;
+use crate::deployment::PlanningNoopImageSourceSkipZoneReason;
+use crate::deployment::PlanningNoopImageSourceStepReport;
+use crate::deployment::PlanningReport;
+use crate::deployment::PlanningZoneUpdatesStepReport;
+use crate::deployment::planning_report::PlanningAddOutOfEligibleSleds;
+use crate::deployment::planning_report::PlanningNoopImageSourceConverted;
+use slog_error_chain::InlineErrorChain;
+
+#[derive(Debug, Clone)]
+pub struct PlanningOperatorNotes {
+    notes: Vec<String>,
+}
+
+impl PlanningOperatorNotes {
+    pub(super) fn new(report: &PlanningReport) -> Self {
+        let PlanningReport {
+            planner_config,
+            expunge,
+            decommission,
+            noop_image_source,
+            mgs_updates,
+            add,
+            zone_updates,
+            // nexus generation bump is a normal part of the end of an update;
+            // there's nothing an operator can act on here.
+            nexus_generation_bump: _,
+            // cockroach cluster settings are not operator-relevant
+            cockroachdb_settings: _,
+        } = report;
+
+        let mut me = Self { notes: Vec::new() };
+
+        me.push_planner_config_notes(planner_config);
+        me.push_expunge_notes(expunge);
+        me.push_decommission_notes(decommission);
+        me.push_noop_image_source_notes(noop_image_source);
+        me.push_mgs_updates_notes(mgs_updates);
+        me.push_add_notes(add);
+        me.push_zone_updates_notes(zone_updates);
+
+        me
+    }
+
+    pub fn into_notes(self) -> Vec<String> {
+        self.notes
+    }
+
+    fn push_planner_config_notes(&mut self, config: &PlannerConfig) {
+        let PlannerConfig { add_zones_with_mupdate_override } = config;
+        if *add_zones_with_mupdate_override {
+            self.notes.push(
+                "support override configured: internal services can be \
+                 placed even if some versions are unknown due to sled \
+                 recovery operations"
+                    .to_string(),
+            );
+        }
+    }
+
+    fn push_expunge_notes(&mut self, expunge: &PlanningExpungeStepReport) {
+        let PlanningExpungeStepReport { orphan_disks } = expunge;
+
+        let mut nsleds = 0;
+        let mut ndisks = 0;
+        for orphans in orphan_disks.values() {
+            nsleds += 1;
+            ndisks += orphans.len();
+        }
+        if nsleds > 0 {
+            self.notes.push(format!(
+                "unexpectedly found {ndisks} orphaned disk{} \
+                 across {nsleds} sled{}",
+                pluralize_s(ndisks),
+                pluralize_s(nsleds)
+            ));
+        }
+    }
+
+    fn push_decommission_notes(
+        &mut self,
+        decommission: &PlanningDecommissionStepReport,
+    ) {
+        let PlanningDecommissionStepReport { zombie_sleds } = decommission;
+        let nsleds = zombie_sleds.len();
+        if nsleds > 0 {
+            self.notes.push(format!(
+                "unexpectedly found {nsleds} already-decommissioned sled{}",
+                pluralize_s(nsleds),
+            ));
+        }
+    }
+
+    fn push_noop_image_source_notes(
+        &mut self,
+        noop_image_source: &PlanningNoopImageSourceStepReport,
+    ) {
+        let PlanningNoopImageSourceStepReport {
+            no_target_release,
+            skipped_sled_zones,
+            skipped_sled_host_phase_2,
+            skipped_zones,
+            converted,
+        } = noop_image_source;
+
+        // If there's no target release, we can't do any updates nor do any
+        // no-op conversions, so just add a single note and bail out.
+        if *no_target_release {
+            self.notes.push(
+                "no update possible: no target release has been set"
+                    .to_string(),
+            );
+            return;
+        }
+
+        // Report notes for sleds we skipped.
+        {
+            let mut nsleds_missing_from_inv = 0;
+            let mut nsleds_error_manifest = 0;
+            let mut nsleds_mupdate_override = 0;
+            for reason in skipped_sled_zones.values() {
+                use PlanningNoopImageSourceSkipSledZonesReason::*;
+
+                match reason {
+                    AllZonesAlreadyArtifact { .. } => {
+                        // This is the normal steady state; nothing to note.
+                    }
+                    SledNotInInventory => {
+                        nsleds_missing_from_inv += 1;
+                    }
+                    ErrorRetrievingZoneManifest { .. } => {
+                        nsleds_error_manifest += 1;
+                    }
+                    RemoveMupdateOverride { .. } => {
+                        nsleds_mupdate_override += 1;
+                    }
+                }
+            }
+            // This loop does nothing, but we write it explicitly so if we add
+            // more reasons in the future, we have a chance to update these
+            // notes.
+            for reason in skipped_sled_host_phase_2.values() {
+                use PlanningNoopImageSourceSkipSledHostPhase2Reason::*;
+                match reason {
+                    BothSlotsAlreadyArtifact => {
+                        // This is the normal steady state; nothing to note.
+                    }
+                    SledNotInInventory => {
+                        // This will already be counted above in
+                        // `nsleds_missing_from_inv`.
+                    }
+                }
+            }
+
+            if nsleds_missing_from_inv > 0 {
+                self.notes.push(format!(
+                    "{nsleds_missing_from_inv} sled{} missing from inventory",
+                    pluralize_s(nsleds_missing_from_inv)
+                ));
+            }
+
+            if nsleds_error_manifest > 0 {
+                self.notes.push(format!(
+                    "{nsleds_error_manifest} sled{} with unexpected errors \
+                     on their boot disks",
+                    pluralize_s(nsleds_error_manifest)
+                ));
+            }
+
+            if nsleds_mupdate_override > 0 {
+                self.notes.push(format!(
+                    "{nsleds_mupdate_override} sled{} waiting for a \
+                     system version that matches their most recent \
+                     support-driven update (mupdate)",
+                    pluralize_s(nsleds_mupdate_override)
+                ));
+            }
+        }
+
+        // Report notes for zones we skipped. If we skipped the entire sled,
+        // that will be covered above; this is only for zones on sleds we
+        // considered.
+        {
+            let mut ninvalid_artifact = 0;
+            let mut nartifact_not_in_repo = 0;
+            for reason in skipped_zones.values() {
+                use PlanningNoopImageSourceSkipZoneReason::*;
+                match reason {
+                    ZoneNotInManifest { .. } => {
+                        // TODO-correctness Should we report anything for this
+                        // reason? It has no constructors at the moment; see
+                        //
+                    }
+                    InvalidArtifact { .. } => {
+                        ninvalid_artifact += 1;
+                    }
+                    ArtifactNotInRepo { .. } => {
+                        nartifact_not_in_repo += 1;
+                    }
+                }
+            }
+
+            if ninvalid_artifact > 0 {
+                self.notes.push(format!(
+                    "{ninvalid_artifact} service zone{} with invalid artifacts",
+                    pluralize_s(ninvalid_artifact)
+                ));
+            }
+
+            if nartifact_not_in_repo > 0 {
+                self.notes.push(format!(
+                    "{nartifact_not_in_repo} service zone{} waiting for a \
+                     system version that matches their most recent \
+                     support-driven update (mupdate)",
+                    pluralize_s(nartifact_not_in_repo)
+                ));
+            }
+        }
+
+        // Report counts of converted artifacts.
+        {
+            let mut nsleds = 0;
+            let mut tot_eligible = 0;
+            let mut tot_dataset = 0;
+            let mut tot_eligible_os = 0;
+            for converted in converted.values() {
+                let PlanningNoopImageSourceConverted {
+                    num_eligible,
+                    num_dataset,
+                    host_phase_2_slot_a_eligible,
+                    host_phase_2_slot_b_eligible,
+                } = converted;
+
+                nsleds += 1;
+                tot_eligible += num_eligible;
+                tot_dataset += num_dataset;
+                // We don't want to report 2 OSs updated for each sled, nor do
+                // we want to complain about failing to convert one slot if we
+                // were able to convert the other. Count _either_ slot being
+                // eligible as a success.
+                if *host_phase_2_slot_a_eligible
+                    || *host_phase_2_slot_b_eligible
+                {
+                    tot_eligible_os += 1;
+                }
+            }
+
+            if nsleds > 0 {
+                self.notes.push(format!(
+                    "across {nsleds} sled{}, converted {}{}{} to known versions",
+                    pluralize_s(nsleds),
+                    if tot_dataset > 0 {
+                        format_args!(
+                            "{tot_eligible}/{tot_dataset} service zone{}",
+                            pluralize_s(tot_eligible),
+                        )
+                    } else {
+                        format_args!("")
+                    },
+                    if tot_dataset > 0 && tot_eligible_os > 0 {
+                        " and "
+                    } else {
+                        ""
+                    },
+                    if tot_eligible_os > 0 {
+                        format_args!(
+                            "{tot_eligible_os} OS image{}",
+                            pluralize_s(tot_eligible_os),
+                        )
+                    } else {
+                        format_args!("")
+                    },
+                ));
+            }
+        }
+    }
+
+    fn push_mgs_updates_notes(
+        &mut self,
+        mgs_updates: &PlanningMgsUpdatesStepReport,
+    ) {
+        let PlanningMgsUpdatesStepReport {
+            blocked_mgs_updates,
+            // These are covered by the update status API
+            pending_mgs_updates: _,
+        } = mgs_updates;
+
+        for update in blocked_mgs_updates {
+            let serial = &update.baseboard_id.serial_number;
+            let component = update.component;
+            let reason = InlineErrorChain::new(&update.reason);
+
+            self.notes.push(format!(
+                "unable to update {serial} {component}: {reason}",
+            ));
+        }
+    }
+
+    fn push_add_notes(&mut self, add: &PlanningAddStepReport) {
+        let PlanningAddStepReport {
+            waiting_on,
+            sleds_without_zpools_for_ntp_zones,
+            // summarized by `waiting_on`, so we ignore details
+            add_update_blocked_reasons: _,
+            // already included in planner config
+            add_zones_with_mupdate_override: _,
+            // not a relevant detail to operators: this is true only for systems
+            // that have never had a reconfigurator-driven update
+            target_release_generation_is_one: _,
+            // sleds waiting for NTP is normal while we're updating NTP zones
+            sleds_without_ntp_zones_in_inventory: _,
+            sleds_waiting_for_ntp_zone: _,
+            sleds_missing_ntp_zone: _,
+            // not a relevant detail to operators: these are "operating
+            // normally" sleds
+            sleds_getting_ntp_and_discretionary_zones: _,
+            sleds_missing_crucible_zone,
+            out_of_eligible_sleds,
+            // not a relevant detail to operators: this is normal operation
+            sufficient_zones_exist: _,
+            // maybe not a relevant detail to operators? we're restoring
+            // redundancy or finishing an expunge+add update
+            discretionary_zones_placed: _,
+        } = add;
+
+        if let Some(waiting_on) = waiting_on {
+            match waiting_on {
+                ZoneAddWaitingOn::MupdateBlockers => self.notes.push(
+                    "service updates are waiting for system to recover \
+                     from support-driven update (mupdate)"
+                        .to_string(),
+                ),
+            }
+        }
+
+        if !sleds_without_zpools_for_ntp_zones.is_empty() {
+            let n = sleds_without_zpools_for_ntp_zones.len();
+            self.notes.push(format!(
+                "{n} sled{} with no available disks to host NTP service",
+                pluralize_s(n),
+            ));
+        }
+
+        {
+            let mut nsleds = 0;
+            let mut npools = 0;
+            for pools in sleds_missing_crucible_zone.values() {
+                nsleds += 1;
+                npools += pools.len();
+            }
+            if npools > 0 {
+                self.notes.push(format!(
+                    "{npools} disk{} unavailable across {nsleds} sled{}",
+                    pluralize_s(npools),
+                    pluralize_s(nsleds),
+                ));
+            }
+        }
+
+        {
+            let mut tot_placed = 0;
+            let mut tot_wanted_to_place = 0;
+            for out_of_eligible_sleds in out_of_eligible_sleds.values() {
+                let PlanningAddOutOfEligibleSleds { placed, wanted_to_place } =
+                    out_of_eligible_sleds;
+                tot_placed += *placed;
+                tot_wanted_to_place += *wanted_to_place;
+            }
+            if tot_placed < tot_wanted_to_place {
+                self.notes.push(format!(
+                    "out of eligible sleds: only placed \
+                     {tot_placed} service{} out of \
+                     {tot_wanted_to_place} desired",
+                    pluralize_s(tot_placed),
+                ));
+            }
+        }
+    }
+
+    fn push_zone_updates_notes(
+        &mut self,
+        zone_updates: &PlanningZoneUpdatesStepReport,
+    ) {
+        let PlanningZoneUpdatesStepReport {
+            unsafe_zones,
+            // these are normal update details covered by update status
+            waiting_zones: _,
+            waiting_on: _,
+            out_of_date_zones: _,
+            expunged_zones: _,
+            updated_zones: _,
+        } = zone_updates;
+
+        // Report problems with particular zone types.
+        {
+            // Only report each kind of problem once (e.g., if 3/5 cockroach
+            // nodes are healthy, don't emit the same note twice for both
+            // unhealthy nodes).
+            let mut emitted_cockroach = false;
+            let mut emitted_boundary_ntp = false;
+            let mut emitted_internal_dns = false;
+            for kind in unsafe_zones.values() {
+                let emit = match kind {
+                    ZoneUnsafeToShutdown::Cockroachdb { .. } => {
+                        if !emitted_cockroach {
+                            emitted_cockroach = true;
+                            true
+                        } else {
+                            false
+                        }
+                    }
+                    ZoneUnsafeToShutdown::BoundaryNtp { .. } => {
+                        if !emitted_boundary_ntp {
+                            emitted_boundary_ntp = true;
+                            true
+                        } else {
+                            false
+                        }
+                    }
+                    ZoneUnsafeToShutdown::InternalDns { .. } => {
+                        if !emitted_internal_dns {
+                            emitted_internal_dns = true;
+                            true
+                        } else {
+                            false
+                        }
+                    }
+                };
+                if emit {
+                    // TODO Will we emit these notes during a normal update
+                    // (e.g., when we reboot a sled hosting boundary NTP)? If
+                    // so, we should probably not emit these unless we can get
+                    // more confidence there's actually something wrong.
+                    self.notes.push(format!(
+                        "waiting for condition to clear: {kind}"
+                    ));
+                }
+            }
+        }
+    }
+}
+
+// Very dumb helper to pluralize words by appending an "s". Only used within
+// this file to pluralize simple nouns like "sleds" and "disks".
+fn pluralize_s(n: usize) -> &'static str {
+    if n == 1 { "" } else { "s" }
+}
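
The same consumption pattern for the new notes API appears twice above (omdb's db/blueprints.rs and nexus.rs). A minimal sketch of that shared pattern, factored into a helper; the name print_operator_notes is hypothetical and not part of this diff:

    use nexus_types::deployment::PlanningReport;

    // Hypothetical helper mirroring the two omdb call sites above:
    // `operator_notes()` condenses the report and `into_notes()` yields the
    // operator-facing strings, which are printed only if any exist.
    fn print_operator_notes(report: &PlanningReport) {
        let notes = report.operator_notes().into_notes();
        if !notes.is_empty() {
            println!("\nnotes for customer operator:");
            for note in notes {
                println!(" * {note}");
            }
        }
    }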