diff --git a/Cargo.lock b/Cargo.lock index 9cbb3b518e..8b8899fa74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8067,6 +8067,7 @@ dependencies = [ "gateway-client", "gateway-messages", "gateway-test-utils", + "gateway-types", "headers", "hex", "hickory-resolver 0.25.2", diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 9b6b1bfe3c..7c752b11cf 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -37,6 +37,7 @@ dropshot.workspace = true fatfs.workspace = true futures.workspace = true gateway-client.workspace = true +gateway-types.workspace = true headers.workspace = true hex.workspace = true hickory-resolver.workspace = true diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index fc32ce8915..3d9d2e787a 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -20,9 +20,11 @@ use futures::stream::FuturesUnordered; use gateway_client::Client as MgsClient; use gateway_client::types::SpIdentifier; use gateway_client::types::SpIgnition; +use gateway_types::component::SpType; use internal_dns_resolver::Resolver; use internal_dns_types::names::ServiceName; use nexus_db_model::Ereport; +use nexus_db_model::Sled; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; @@ -47,9 +49,11 @@ use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use parallel_task_set::ParallelTaskSet; +use serde::Serialize; use serde_json::json; use sha2::{Digest, Sha256}; use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; use std::future::Future; use std::io::Write; use std::num::NonZeroU64; @@ -61,6 +65,7 @@ use tokio::io::AsyncWriteExt; use tokio::io::SeekFrom; use tokio_util::task::AbortOnDropHandle; use tufaceous_artifact::ArtifactHash; +use uuid::Uuid; use zip::ZipArchive; use zip::ZipWriter; use zip::write::FullFileOptions; @@ -707,23 +712,44 @@ impl BundleCollection { None }; - let sp_dumps_dir = dir.path().join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { - format!("failed to create SP task dump directory {sp_dumps_dir}") - })?; - if let Err(e) = - save_all_sp_dumps(log, &self.resolver, &sp_dumps_dir).await - { - error!(log, "failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref())); - } else { - report.listed_sps = true; - }; - - if let Ok(all_sleds) = self + let all_sleds = self .datastore .sled_list_all_batched(&self.opctx, SledFilter::InService) + .await; + + if let Ok(mgs_client) = self.create_mgs_client().await { + if let Err(e) = write_sled_info( + &self.log, + &mgs_client, + all_sleds.as_deref().ok(), + dir.path(), + ) .await - { + { + error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref())); + } + + let sp_dumps_dir = dir.path().join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context( + || { + format!( + "Failed to create SP task dump directory {sp_dumps_dir}" + ) + }, + )?; + + if let Err(e) = + save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await + { + error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref())); + } else { + report.listed_sps = true; + }; + } else { + warn!(log, "No MGS client, skipping SP task dump collection"); + } + + if let Ok(all_sleds) = all_sleds { report.listed_in_service_sleds = true; const MAX_CONCURRENT_SLED_REQUESTS: usize = 16; @@ -1031,6 +1057,20 @@ impl BundleCollection { ); Ok(()) } + + async fn create_mgs_client(&self) -> anyhow::Result { + self + .resolver + .lookup_socket_v6(ServiceName::ManagementGatewayService) + .await + .map(|sockaddr| { + let url = format!("http://{}", sockaddr); + gateway_client::Client::new(&url, self.log.clone()) + }).map_err(|e| { + error!(self.log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e)); + e.into() + }) + } } impl BackgroundTask for SupportBundleCollector { @@ -1316,18 +1356,9 @@ where /// Collect task dumps from all SPs via MGS and save them to a directory. async fn save_all_sp_dumps( log: &slog::Logger, - resolver: &Resolver, + mgs_client: &MgsClient, sp_dumps_dir: &Utf8Path, ) -> anyhow::Result<()> { - let mgs_client = resolver - .lookup_socket_v6(ServiceName::ManagementGatewayService) - .await - .map(|sockaddr| { - let url = format!("http://{}", sockaddr); - gateway_client::Client::new(&url, log.clone()) - }) - .context("failed to resolve address of MGS")?; - let available_sps = get_available_sps(&mgs_client).await?; let mut tasks = ParallelTaskSet::new(); @@ -1412,6 +1443,82 @@ async fn save_sp_dumps( Ok(()) } +/// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier +/// identification of sleds present in a bundle. +async fn write_sled_info( + log: &slog::Logger, + mgs_client: &MgsClient, + nexus_sleds: Option<&[Sled]>, + dir: &Utf8Path, +) -> anyhow::Result<()> { + #[derive(Serialize)] + struct SledInfo { + cubby: Option, + uuid: Option, + } + + let available_sps = get_available_sps(&mgs_client) + .await + .context("failed to get available SPs")?; + + // We can still get a useful mapping of cubby to serial using just the data from MGS. + let mut nexus_map: BTreeMap<_, _> = nexus_sleds + .unwrap_or_default() + .into_iter() + .map(|sled| (sled.serial_number(), sled)) + .collect(); + + let mut sled_info = BTreeMap::new(); + for sp in + available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled)) + { + let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await { + Ok(s) => s.into_inner(), + Err(e) => { + error!(log, + "Failed to get SP state for sled_info.json"; + "cubby" => sp.slot, + "component" => %sp.type_, + "error" => InlineErrorChain::new(&e) + ); + continue; + } + }; + + if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { + cubby: Some(sp.slot), + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } else { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { cubby: Some(sp.slot), uuid: None }, + ); + } + } + + // Sleds not returned by MGS. + for (serial, sled) in nexus_map { + sled_info.insert( + serial.to_string(), + SledInfo { + cubby: None, + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } + + let json = serde_json::to_string_pretty(&sled_info) + .context("failed to serialize sled info to JSON")?; + tokio::fs::write(dir.join("sled_info.json"), json).await?; + + Ok(()) +} + fn is_fs_safe_single_path_component(s: &str) -> bool { // Might be path traversal... if s == "." || s == ".." {