Skip to content

Commit d581075

Browse files
authored
[nexus] Snarf ereports from CRDB into support bundles (#8739)
PR #8269 added CRDB tables for storing ereports received from both service processors and the sled host OS. These ereports are generated to indicate a fault or other important event, so they contain information that's probably worth including in support bundles. So we should do that. This branch adds code to the `SupportBundleCollector` background task for querying the database for ereports and putting them in the bundle. This, in turn, required adding code for querying ereports over a specified time range. The `BundleRequest` can be constructed with a set of filters for ereports, including the time window, and a list of serial numbers to collect ereports from. Presently, we always just use the default: we collect ereports from all serial numbers from the last 7 days prior to bundle collection. But, I anticipate that this will be used more in the future when we add a notion of targeted support bundles: for instance, if we generate a support bundle for a particular sled, we would probably only grab ereports from that sled. Ereports are stored in an `ereports` directory in the bundle, with subdirectories for each serial number that emitted an ereport. Each serial number directory has a subdirectory for each ereport restart ID of that serial, and the individual ereports are stored within the restart ID directory as JSON files. The path to an individual ereport will be `ereports/${SERIAL_NUMBER}/${RESTART_ID}/${ENA}.json`. I'm open to changing this organization scheme if others think there's a better approach --- for example, we could place the restart ID in the filename rather than in a subdirectory if that would be more useful. Ereport collection is done in parallel with the rest of the support bundle collection by spawning Tokio tasks to collect host OS and service processor ereports.
`tokio_util::task::AbortOnDropHandle` is used to wrap the `JoinHandle`s for these tasks to ensure they're aborted if the ereport collection future is dropped, so that we stop collecting ereports if the support bundle is cancelled. Fixes #8649
1 parent 68a8c4b commit d581075

File tree

11 files changed

+673
-18
lines changed

11 files changed

+673
-18
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/omdb/src/bin/omdb/db/ereport.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ async fn cmd_db_ereport_list(
155155
restart_id: restart_id.into_untyped_uuid(),
156156
ena: ena.into(),
157157
class: class.clone(),
158-
source: db::model::Reporter::Sp { sp_type, slot: sp_slot.0 },
158+
source: db::model::Reporter::Sp {
159+
sp_type: sp_type.into(),
160+
slot: sp_slot.0,
161+
},
159162
serial: serial_number.as_deref(),
160163
part_number: part_number.as_deref(),
161164
}
@@ -547,7 +550,10 @@ async fn cmd_db_ereporters(
547550
)| {
548551
ReporterRow {
549552
first_seen,
550-
identity: db::model::Reporter::Sp { slot: slot.0, sp_type },
553+
identity: db::model::Reporter::Sp {
554+
slot: slot.0,
555+
sp_type: sp_type.into(),
556+
},
551557
serial,
552558
part_number,
553559
id: restart_id,

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
6666
use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
6767
use nexus_types::internal_api::background::SupportBundleCleanupReport;
6868
use nexus_types::internal_api::background::SupportBundleCollectionReport;
69+
use nexus_types::internal_api::background::SupportBundleEreportStatus;
6970
use nexus_types::internal_api::background::TufArtifactReplicationCounters;
7071
use nexus_types::internal_api::background::TufArtifactReplicationRequest;
7172
use nexus_types::internal_api::background::TufArtifactReplicationStatus;
@@ -2414,6 +2415,8 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
24142415
listed_in_service_sleds,
24152416
listed_sps,
24162417
activated_in_db_ok,
2418+
sp_ereports,
2419+
host_ereports,
24172420
}) = collection_report
24182421
{
24192422
println!(" Support Bundle Collection Report:");
@@ -2427,6 +2430,26 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
24272430
println!(
24282431
" Bundle was activated in the database: {activated_in_db_ok}"
24292432
);
2433+
print_ereport_status("SP", &sp_ereports);
2434+
print_ereport_status("Host OS", &host_ereports);
2435+
}
2436+
}
2437+
}
2438+
2439+
fn print_ereport_status(which: &str, status: &SupportBundleEreportStatus) {
2440+
match status {
2441+
SupportBundleEreportStatus::NotRequested => {
2442+
println!(" {which} ereport collection was not requested");
2443+
}
2444+
SupportBundleEreportStatus::Failed { error, n_collected } => {
2445+
println!(" {which} ereport collection failed:");
2446+
println!(
2447+
" ereports collected successfully: {n_collected}"
2448+
);
2449+
println!(" error: {error}");
2450+
}
2451+
SupportBundleEreportStatus::Collected { n_collected } => {
2452+
println!(" {which} ereports collected: {n_collected}");
24302453
}
24312454
}
24322455
}

nexus/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ tempfile.workspace = true
107107
thiserror.workspace = true
108108
tokio = { workspace = true, features = ["full"] }
109109
tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
110-
tokio-util = { workspace = true, features = ["codec"] }
110+
tokio-util = { workspace = true, features = ["codec", "rt"] }
111111
tough.workspace = true
112112
tufaceous-artifact.workspace = true
113113
usdt.workspace = true

nexus/db-model/src/ereport.rs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,14 @@ where
6363
}
6464
}
6565

66-
#[derive(Clone, Debug)]
66+
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
6767
pub struct Ereport {
68+
#[serde(flatten)]
6869
pub id: EreportId,
70+
#[serde(flatten)]
6971
pub metadata: EreportMetadata,
7072
pub reporter: Reporter,
73+
#[serde(flatten)]
7174
pub report: serde_json::Value,
7275
}
7376

@@ -96,7 +99,7 @@ impl From<SpEreport> for Ereport {
9699
serial_number,
97100
class,
98101
},
99-
reporter: Reporter::Sp { sp_type, slot: sp_slot.0 },
102+
reporter: Reporter::Sp { sp_type: sp_type.into(), slot: sp_slot.0 },
100103
report,
101104
}
102105
}
@@ -131,7 +134,7 @@ impl From<HostEreport> for Ereport {
131134
}
132135
}
133136

134-
#[derive(Clone, Debug)]
137+
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
135138
pub struct EreportMetadata {
136139
pub time_collected: DateTime<Utc>,
137140
pub time_deleted: Option<DateTime<Utc>>,
@@ -141,22 +144,40 @@ pub struct EreportMetadata {
141144
pub class: Option<String>,
142145
}
143146

144-
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
147+
#[derive(
148+
Clone,
149+
Debug,
150+
Eq,
151+
PartialEq,
152+
Ord,
153+
PartialOrd,
154+
serde::Serialize,
155+
serde::Deserialize,
156+
)]
145157
pub enum Reporter {
146-
Sp { sp_type: SpType, slot: u16 },
158+
Sp { sp_type: nexus_types::inventory::SpType, slot: u16 },
147159
HostOs { sled: SledUuid },
148160
}
149161

150162
impl std::fmt::Display for Reporter {
151163
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
152164
match self {
153-
Self::Sp { sp_type: SpType::Sled, slot } => {
165+
Self::Sp {
166+
sp_type: nexus_types::inventory::SpType::Sled,
167+
slot,
168+
} => {
154169
write!(f, "Sled (SP) {slot:02}")
155170
}
156-
Self::Sp { sp_type: SpType::Switch, slot } => {
171+
Self::Sp {
172+
sp_type: nexus_types::inventory::SpType::Switch,
173+
slot,
174+
} => {
157175
write!(f, "Switch {slot}")
158176
}
159-
Self::Sp { sp_type: SpType::Power, slot } => {
177+
Self::Sp {
178+
sp_type: nexus_types::inventory::SpType::Power,
179+
slot,
180+
} => {
160181
write!(f, "PSC {slot}")
161182
}
162183
Self::HostOs { sled } => {

nexus/db-queries/src/db/datastore/ereport.rs

Lines changed: 166 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use crate::db::model::SpMgsSlot;
1717
use crate::db::model::SpType;
1818
use crate::db::model::SqlU16;
1919
use crate::db::model::SqlU32;
20-
use crate::db::pagination::paginated;
20+
use crate::db::pagination::{paginated, paginated_multicolumn};
2121
use async_bb8_diesel::AsyncRunQueryDsl;
2222
use chrono::DateTime;
2323
use chrono::Utc;
@@ -48,6 +48,44 @@ pub struct EreporterRestartBySerial {
4848
pub ereports: u32,
4949
}
5050

51+
/// A set of filters for fetching ereports.
52+
#[derive(Clone, Debug, Default)]
53+
pub struct EreportFilters {
54+
/// If present, include only ereports that were collected at the specified
55+
/// timestamp or later.
56+
///
57+
/// If `end_time` is also present, this value *must* be earlier than
58+
/// `end_time`.
59+
pub start_time: Option<DateTime<Utc>>,
60+
/// If present, include only ereports that were collected at the specified
61+
/// timestamp or before.
62+
///
63+
/// If `start_time` is also present, this value *must* be later than
64+
/// `start_time`.
65+
pub end_time: Option<DateTime<Utc>>,
66+
/// If this list is non-empty, include only ereports that were reported by
67+
/// systems with the provided serial numbers.
68+
pub only_serials: Vec<String>,
69+
/// If this list is non-empty, include only ereports with the provided class
70+
/// strings.
71+
// TODO(eliza): globbing could be nice to add here eventually...
72+
pub only_classes: Vec<String>,
73+
}
74+
75+
impl EreportFilters {
76+
fn check_time_range(&self) -> Result<(), Error> {
77+
if let (Some(start), Some(end)) = (self.start_time, self.end_time) {
78+
if start > end {
79+
return Err(Error::invalid_request(
80+
"start time must be before end time",
81+
));
82+
}
83+
}
84+
85+
Ok(())
86+
}
87+
}
88+
5189
impl DataStore {
5290
/// Fetch an ereport by its restart ID and ENA.
5391
///
@@ -93,6 +131,90 @@ impl DataStore {
93131
Err(Error::non_resourcetype_not_found(format!("ereport {id}")))
94132
}
95133

134+
pub async fn host_ereports_fetch_matching(
135+
&self,
136+
opctx: &OpContext,
137+
filters: &EreportFilters,
138+
pagparams: &DataPageParams<'_, (Uuid, DbEna)>,
139+
) -> ListResultVec<HostEreport> {
140+
opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
141+
filters.check_time_range()?;
142+
143+
let mut query = paginated_multicolumn(
144+
host_dsl::host_ereport,
145+
(host_dsl::restart_id, host_dsl::ena),
146+
pagparams,
147+
)
148+
.filter(host_dsl::time_deleted.is_null())
149+
.select(HostEreport::as_select());
150+
151+
if let Some(start) = filters.start_time {
152+
query = query.filter(host_dsl::time_collected.ge(start));
153+
}
154+
155+
if let Some(end) = filters.end_time {
156+
query = query.filter(host_dsl::time_collected.le(end));
157+
}
158+
159+
if !filters.only_serials.is_empty() {
160+
query = query.filter(
161+
host_dsl::sled_serial.eq_any(filters.only_serials.clone()),
162+
);
163+
}
164+
165+
if !filters.only_classes.is_empty() {
166+
query = query
167+
.filter(host_dsl::class.eq_any(filters.only_classes.clone()));
168+
}
169+
170+
query
171+
.load_async(&*self.pool_connection_authorized(opctx).await?)
172+
.await
173+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
174+
}
175+
176+
pub async fn sp_ereports_fetch_matching(
177+
&self,
178+
opctx: &OpContext,
179+
filters: &EreportFilters,
180+
pagparams: &DataPageParams<'_, (Uuid, DbEna)>,
181+
) -> ListResultVec<SpEreport> {
182+
opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
183+
filters.check_time_range()?;
184+
185+
let mut query = paginated_multicolumn(
186+
sp_dsl::sp_ereport,
187+
(sp_dsl::restart_id, sp_dsl::ena),
188+
pagparams,
189+
)
190+
.filter(sp_dsl::time_deleted.is_null())
191+
.select(SpEreport::as_select());
192+
193+
if let Some(start) = filters.start_time {
194+
query = query.filter(sp_dsl::time_collected.ge(start));
195+
}
196+
197+
if let Some(end) = filters.end_time {
198+
query = query.filter(sp_dsl::time_collected.le(end));
199+
}
200+
201+
if !filters.only_serials.is_empty() {
202+
query = query.filter(
203+
sp_dsl::serial_number.eq_any(filters.only_serials.clone()),
204+
);
205+
}
206+
207+
if !filters.only_classes.is_empty() {
208+
query = query
209+
.filter(sp_dsl::class.eq_any(filters.only_classes.clone()));
210+
}
211+
212+
query
213+
.load_async(&*self.pool_connection_authorized(opctx).await?)
214+
.await
215+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
216+
}
217+
96218
/// List ereports from the SP with the given restart ID.
97219
pub async fn sp_ereport_list_by_restart(
98220
&self,
@@ -144,7 +266,7 @@ impl DataStore {
144266
EreporterRestartBySerial {
145267
id: EreporterRestartUuid::from_untyped_uuid(restart_id),
146268
reporter_kind: Reporter::Sp {
147-
sp_type,
269+
sp_type: sp_type.into(),
148270
slot: sp_slot.into(),
149271
},
150272
first_seen_at: first_seen.expect(FIRST_SEEN_NOT_NULL),
@@ -261,8 +383,20 @@ impl DataStore {
261383
sled_id: SledUuid,
262384
) -> Result<Option<ereport_types::EreportId>, Error> {
263385
opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
386+
self.host_latest_ereport_id_on_conn(
387+
&*self.pool_connection_authorized(opctx).await?,
388+
sled_id,
389+
)
390+
.await
391+
}
392+
393+
async fn host_latest_ereport_id_on_conn(
394+
&self,
395+
conn: &async_bb8_diesel::Connection<DbConnection>,
396+
sled_id: SledUuid,
397+
) -> Result<Option<ereport_types::EreportId>, Error> {
264398
let id = Self::host_latest_ereport_id_query(sled_id)
265-
.get_result_async(&*self.pool_connection_authorized(opctx).await?)
399+
.get_result_async(conn)
266400
.await
267401
.optional()
268402
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?
@@ -314,6 +448,35 @@ impl DataStore {
314448
})?;
315449
Ok((created, latest))
316450
}
451+
452+
pub async fn host_ereports_insert(
453+
&self,
454+
opctx: &OpContext,
455+
sled_id: SledUuid,
456+
ereports: Vec<HostEreport>,
457+
) -> CreateResult<(usize, Option<ereport_types::EreportId>)> {
458+
opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?;
459+
let conn = self.pool_connection_authorized(opctx).await?;
460+
let created = diesel::insert_into(host_dsl::host_ereport)
461+
.values(ereports)
462+
.on_conflict((host_dsl::restart_id, host_dsl::ena))
463+
.do_nothing()
464+
.execute_async(&*conn)
465+
.await
466+
.map_err(|e| {
467+
public_error_from_diesel(e, ErrorHandler::Server)
468+
.internal_context("failed to insert ereports")
469+
})?;
470+
let latest = self
471+
.host_latest_ereport_id_on_conn(&conn, sled_id)
472+
.await
473+
.map_err(|e| {
474+
e.internal_context(format!(
475+
"failed to refresh latest ereport ID for {sled_id}"
476+
))
477+
})?;
478+
Ok((created, latest))
479+
}
317480
}
318481

319482
fn id_from_tuple(

nexus/db-queries/src/db/datastore/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ mod zpool;
121121
pub use address_lot::AddressLotCreateResult;
122122
pub use dns::DataStoreDnsTest;
123123
pub use dns::DnsVersionUpdateBuilder;
124+
pub use ereport::EreportFilters;
124125
pub use instance::{
125126
InstanceAndActiveVmm, InstanceGestalt, InstanceStateComputer,
126127
};

0 commit comments

Comments
 (0)