Skip to content

Commit 6ab7e96

Browse files
authored
[nexus] Add part to service bundle ereport paths (#8767)
In #8739, I added code for collecting ereports into support bundles which stores the ereport JSON in directories for each sled/switch/PSC serial number from which an ereport was received. Unfortunately, I failed failed to consider that the version 1 Oxide serial numbers are only unique within the namespace of a particular part, and not globally --- so (for example) a switch and a compute sled may have colliding serials. This means that the current code could incorrectly group ereports reported by two totally different devices. While the ereport JSON files _do_ contain additional information that disambiguates this (it includes includes the part number, as well as MGS metadata with the SP type for SP ereports), and restart IDs are additionally capable of distinguishing between reporters, putting ereports from two different systems within the same directory still has the potential to be quite misleading. Thus, this branch changes the paths for ereports to include the part number as well as the serial number, in the format: ``` {part_number}-{serial_number}/{restart_id}/{ENA}.json ``` In order to include part numbers for host OS ereports, I decided to add a part number column to the `host_ereport` table as well. Initially, I had opted not to do this, as I was thinking that, since `host_ereport` includes a sled UUID, we could just join with the `sled` table to get the part number. However, it occurred to me that ereports may be received from a sled that's later expunged from the rack, and the `sled` record for the sled may eventually be deleted, so such a join would fail. We might retain such ereports past the lifetime of the sled in the rack. So, I thought it was better to always include the part number in the ereport record. I've added a migration that attempts to backfill the `host_ereport.part_number` column from the `sled` table for existing host OS ereport records. In practice, this won't do anything, since we're not collecting them yet,but it seemed nice to have. Sadly, the column had to be left nullable, since we may theoretically encounter an ereport with a sled UUID that points to an already-deleted sled record, but...whatever. Since there aren't currently any host OS ereport records anyway, this shouldn't happen, and we'll just handle the nullability; this isn't terrible as we must already do so for SP ereport records. Fixes #8765
1 parent 60a5cf0 commit 6ab7e96

File tree

7 files changed

+56
-22
lines changed

7 files changed

+56
-22
lines changed

nexus/db-model/src/ereport.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,15 @@ impl From<HostEreport> for Ereport {
117117
sled_id,
118118
class,
119119
report,
120+
part_number,
120121
} = host_report;
121122
Ereport {
122123
id: EreportId { restart_id: restart_id.into(), ena: ena.into() },
123124
metadata: EreportMetadata {
124125
time_collected,
125126
time_deleted,
126127
collector_id: collector_id.into(),
127-
part_number: None, // TODO
128+
part_number,
128129
serial_number: Some(sled_serial),
129130
class,
130131
},
@@ -253,4 +254,10 @@ pub struct HostEreport {
253254
pub class: Option<String>,
254255

255256
pub report: serde_json::Value,
257+
258+
// It's a shame this has to be nullable, while the serial is not. However,
259+
// this field was added in a migration, and we have to be able to handle the
260+
// case where a sled record was hard-deleted when backfilling the ereport
261+
// table's part_number column. Sad.
262+
pub part_number: Option<String>,
256263
}

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(176, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(177, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(177, "add-host-ereport-part-number"),
3132
KnownVersion::new(176, "audit-log"),
3233
KnownVersion::new(175, "inv-host-phase-1-active-slot"),
3334
KnownVersion::new(174, "add-tuf-rot-by-sign"),

nexus/db-schema/src/schema.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2664,6 +2664,8 @@ table! {
26642664
class -> Nullable<Text>,
26652665

26662666
report -> Jsonb,
2667+
2668+
part_number -> Nullable<Text>,
26672669
}
26682670
}
26692671

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,32 +1081,41 @@ impl BackgroundTask for SupportBundleCollector {
10811081
}
10821082

10831083
async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> {
1084-
// Here's where we construct the file path for each ereport JSON file, given
1085-
// the top-lebel ereport directory path. Each ereport is stored in a
1086-
// subdirectory for the serial number of the system that produced the
1087-
// ereport. These paths take the following form:
1084+
// Here's where we construct the file path for each ereport JSON file,
1085+
// given the top-level ereport directory path. Each ereport is stored in a
1086+
// subdirectory for the part and serial numbers of the system that produced
1087+
// the ereport. Part numbers must be included in addition to serial
1088+
// numbers, as the v1 serial scheme only guarantees uniqueness within a
1089+
// part number. These paths take the following form:
10881090
//
1089-
// {serial_number}/{restart_id}/{ENA}.json
1091+
// {part-number}-{serial_number}/{restart_id}/{ENA}.json
10901092
//
1091-
// We can assume that the restart ID and serial number consist only of
1093+
// We can assume that the restart ID and ENA consist only of
10921094
// filesystem-safe characters, as the restart ID is known to be a UUID, and
1093-
// the ENA is just an integer. For the serial number, which we don't have
1094-
// full control over --- it came from the ereport metadata --- we must
1095-
// check that it doesn't contain any characters unsuitable for use in a
1096-
// filesystem path.
1097-
let sn = &ereport
1095+
// the ENA is just an integer. For the serial and part numbers, which
1096+
// Nexus doesn't have full control over --- it came from the ereport
1097+
// metadata --- we must check that it doesn't contain any characters
1098+
// unsuitable for use in a filesystem path.
1099+
let pn = ereport
1100+
.metadata
1101+
.part_number
1102+
.as_deref()
1103+
// If the part or serial numbers contain any unsavoury characters, it
1104+
// goes in the `unknown_serial` hole! Note that the alleged serial
1105+
// number from the ereport will still be present in the JSON as a
1106+
// string, so we're not *lying* about what was received; we're just
1107+
// giving up on using it in the path.
1108+
.filter(|&s| is_fs_safe_single_path_component(s))
1109+
.unwrap_or("unknown_part");
1110+
let sn = ereport
10981111
.metadata
10991112
.serial_number
11001113
.as_deref()
1101-
// If the serial number contains any unsavoury characters, it goes in
1102-
// the `unknown_serial` hole! Note that the alleged serial number from
1103-
// the ereport will still be present in the JSON as a string, so we're
1104-
// not *lying* about what was received; we're just giving up on using it
1105-
// in the path.
11061114
.filter(|&s| is_fs_safe_single_path_component(s))
11071115
.unwrap_or("unknown_serial");
1116+
11081117
let dir = dir
1109-
.join(sn)
1118+
.join(format!("{pn}-{sn}"))
11101119
// N.B. that we call `into_untyped_uuid()` here, as the `Display`
11111120
// implementation for a typed UUID appends " (ereporter_restart)", which
11121121
// we don't want.
@@ -1116,11 +1125,11 @@ async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> {
11161125
.with_context(|| format!("failed to create directory '{dir}'"))?;
11171126
let file_path = dir.join(format!("{}.json", ereport.id.ena));
11181127
let json = serde_json::to_vec(&ereport).with_context(|| {
1119-
format!("failed to serialize ereport '{sn}:{}'", ereport.id)
1128+
format!("failed to serialize ereport {pn}:{sn}/{}", ereport.id)
11201129
})?;
11211130
tokio::fs::write(&file_path, json)
11221131
.await
1123-
.with_context(|| format!("failed to write {file_path}'"))
1132+
.with_context(|| format!("failed to write '{file_path}'"))
11241133
}
11251134

11261135
// Takes a directory "dir", and zips the contents into a single zipfile.
@@ -1701,6 +1710,7 @@ mod test {
17011710
collector_id: OmicronZoneUuid::new_v4().into(),
17021711
sled_id: sled_id.into(),
17031712
sled_serial: HOST_SERIAL.to_string(),
1713+
part_number: Some(GIMLET_PN.to_string()),
17041714
class: Some("ereport.fake.whatever".to_string()),
17051715
report: serde_json::json!({"hello_world": true}),
17061716
},
@@ -1712,6 +1722,7 @@ mod test {
17121722
collector_id: OmicronZoneUuid::new_v4().into(),
17131723
sled_id: sled_id.into(),
17141724
sled_serial: HOST_SERIAL.to_string(),
1725+
part_number: Some(GIMLET_PN.to_string()),
17151726
class: Some("ereport.fake.whatever.thingy".to_string()),
17161727
report: serde_json::json!({"goodbye_world": false}),
17171728
},
@@ -1734,6 +1745,7 @@ mod test {
17341745
collector_id: OmicronZoneUuid::new_v4().into(),
17351746
sled_id: sled_id.into(),
17361747
sled_serial: HOST_SERIAL.to_string(),
1748+
part_number: Some(GIMLET_PN.to_string()),
17371749
class: Some("ereport.something.hostos_related".to_string()),
17381750
report: serde_json::json!({"illumos": "very yes", "whatever": 42}),
17391751
},
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ALTER TABLE omicron.public.host_ereport
2+
ADD COLUMN IF NOT EXISTS part_number STRING(63);
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- backfill host OS ereports part_number column by querying the sled table. if
2+
-- there isn't a sled for the ereport's sled UUID in the sled table (i.e. if the
3+
-- sled record was hard deleted), leave it null.
4+
SET LOCAL disallow_full_table_scans = off;
5+
UPDATE omicron.public.host_ereport
6+
SET part_number = sled.part_number
7+
FROM sled
8+
WHERE host_ereport.sled_id = sled.id;

schema/crdb/dbinit.sql

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6460,6 +6460,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport (
64606460
*/
64616461
report JSONB NOT NULL,
64626462

6463+
part_number STRING(63),
6464+
64636465
PRIMARY KEY (restart_id, ena)
64646466
);
64656467

@@ -6495,7 +6497,7 @@ INSERT INTO omicron.public.db_metadata (
64956497
version,
64966498
target_version
64976499
) VALUES
6498-
(TRUE, NOW(), NOW(), '176.0.0', NULL)
6500+
(TRUE, NOW(), NOW(), '177.0.0', NULL)
64996501
ON CONFLICT DO NOTHING;
65006502

65016503
COMMIT;

0 commit comments

Comments
 (0)