Skip to content

Commit 46b7f4e

Browse files
authored
[nexus] config flag to disable SP ereport ingestion (#8709)
PR #8296 added the `sp_ereport_ingester` background task to Nexus for periodically collecting ereports from SPs via MGS. However, the Hubris PR adding the Hubris task that actually responds to these requests from the control plane, oxidecomputer/hubris#2126, won't make it in until after R16. This means that if we release R16 with a control plane that tries to collect ereports, and a SP firmware that doesn't know how to respond to such requests, the Nexus logs will be littered with 36 log lines like this every 30 seconds: ``` 20:58:04.603Z DEBG 65a11c18-7f59-41ac-b9e7-680627f996e7 (ServerContext): client response background_task = sp_ereport_ingester gateway_url = http://[fd00:1122:3344:108::2]:12225 result = Ok(Response { url: "http://[fd00:1122:3344:108::2]:12225/sp/sled/29/ereports?limit=255&restart_id=00000000-0000-0000-0000-000000000000", status: 503, headers: {"content-type": "application/json", "x-request-id": "35390a4a-6d3a-4683-be88-217267b46da0", "content-length": "224", "date": "Mon, 28 Jul 2025 20:58:04 GMT"} }) 20:58:04.603Z WARN 65a11c18-7f59-41ac-b9e7-680627f996e7 (ServerContext): ereport collection: unanticipated MGS request error: Error Response: status: 503 Service Unavailable; headers: {"content-type": "application/json", "x-request-id": "35390a4a-6d3a-4683-be88-217267b46da0", "content-length": "224", "date": "Mon, 28 Jul 2025 20:58:04 GMT"}; value: Error { error_code: Some("SpCommunicationFailed"), message: "error communicating with SP SpIdentifier { typ: Sled, slot: 29 }: RPC call failed (gave up after 5 attempts)", request_id: "35390a4a-6d3a-4683-be88-217267b46da0" } background_task = sp_ereport_ingester committed_ena = None error = Error Response: status: 503 Service Unavailable; headers: {"content-type": "application/json", "x-request-id": "35390a4a-6d3a-4683-be88-217267b46da0", "content-length": "224", "date": "Mon, 28 Jul 2025 20:58:04 GMT"}; value: Error { error_code: Some("SpCommunicationFailed"), message: "error communicating with SP 
SpIdentifier { typ: Sled, slot: 29 }: RPC call failed (gave up after 5 attempts)", request_id: "35390a4a-6d3a-4683-be88-217267b46da0" } file = nexus/src/app/background/tasks/ereport_ingester.rs:380 gateway_addr = [fd00:1122:3344:108::2]:12225 restart_id = 00000000-0000-0000-0000-000000000000 (ereporter_restart) slot = 29 sp_type = sled start_ena = None ``` Similarly, MGS will also have a bunch of noisy complaints about these requests failing. The consequences of this are really not terrible: it just means we'll be logging a lot of errors. But it seems mildly unfortunate to be constantly trying to do something that's invariably doomed to failure, and then yelling about how it didn't work. So, this commit adds a config flag for disabling the whole thing, which we can set (disabling ingestion) in R16's production Nexus config and then unset (re-enabling ingestion) once the Hubris changes make it in. I did this using a config setting, rather than hard-coding it to always be disabled, because there are also integration tests for this stuff, which would break if we disabled it everywhere.
1 parent 266b629 commit 46b7f4e

File tree

7 files changed

+50
-5
lines changed

7 files changed

+50
-5
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2787,7 +2787,7 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) {
27872787
use nexus_types::internal_api::background::SpEreportIngesterStatus;
27882788
use nexus_types::internal_api::background::SpEreporterStatus;
27892789

2790-
let SpEreportIngesterStatus { sps, errors } =
2790+
let SpEreportIngesterStatus { sps, errors, disabled } =
27912791
match serde_json::from_value(details.clone()) {
27922792
Err(error) => {
27932793
eprintln!(
@@ -2813,9 +2813,19 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) {
28132813
}
28142814
}
28152815

2816-
print_ereporter_status_totals(sps.iter().map(|sp| &sp.status));
2816+
if disabled {
2817+
println!(" SP ereport ingestion explicitly disabled by config!");
2818+
} else {
2819+
print_ereporter_status_totals(sps.iter().map(|sp| &sp.status));
2820+
}
28172821

28182822
if !sps.is_empty() {
2823+
if disabled {
2824+
println!(
2825+
"/!\\ WEIRD: SP ereport ingestion disabled by config, but \
2826+
some SP statuses were recorded!"
2827+
)
2828+
}
28192829
println!("\n service processors:");
28202830
for SpEreporterStatus { sp_type, slot, status } in &sps {
28212831
println!(

nexus-config/src/nexus_config.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -815,11 +815,20 @@ pub struct SpEreportIngesterConfig {
815815
/// period (in seconds) for periodic activations of this background task
816816
#[serde_as(as = "DurationSeconds<u64>")]
817817
pub period_secs: Duration,
818+
819+
/// disable ereport collection altogether
820+
///
821+
/// This is an emergency lever for support / operations. It should never be
822+
/// necessary.
823+
///
824+
/// Default: Off
825+
#[serde(default)]
826+
pub disable: bool,
818827
}
819828

820829
impl Default for SpEreportIngesterConfig {
821830
fn default() -> Self {
822-
Self { period_secs: Duration::from_secs(30) }
831+
Self { period_secs: Duration::from_secs(30), disable: false }
823832
}
824833
}
825834

@@ -1320,6 +1329,7 @@ mod test {
13201329
},
13211330
sp_ereport_ingester: SpEreportIngesterConfig {
13221331
period_secs: Duration::from_secs(47),
1332+
disable: false,
13231333
},
13241334
},
13251335
default_region_allocation_strategy:

nexus/src/app/background/init.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,10 @@ impl BackgroundTasksInitializer {
985985
description: "collects error reports from service processors",
986986
period: config.sp_ereport_ingester.period_secs,
987987
task_impl: Box::new(ereport_ingester::SpEreportIngester::new(
988-
datastore, resolver, nexus_id,
988+
datastore,
989+
resolver,
990+
nexus_id,
991+
config.sp_ereport_ingester.disable,
989992
)),
990993
opctx: opctx.child(BTreeMap::new()),
991994
watchers: vec![],

nexus/src/app/background/tasks/ereport_ingester.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use std::sync::Arc;
3131

3232
pub struct SpEreportIngester {
3333
resolver: internal_dns_resolver::Resolver,
34+
disabled: bool,
3435
inner: Ingester,
3536
}
3637

@@ -58,15 +59,24 @@ impl SpEreportIngester {
5859
datastore: Arc<DataStore>,
5960
resolver: internal_dns_resolver::Resolver,
6061
nexus_id: OmicronZoneUuid,
62+
disabled: bool,
6163
) -> Self {
62-
Self { resolver, inner: Ingester { datastore, nexus_id } }
64+
Self { resolver, inner: Ingester { datastore, nexus_id }, disabled }
6365
}
6466

6567
async fn actually_activate(
6668
&mut self,
6769
opctx: &OpContext,
6870
) -> SpEreportIngesterStatus {
6971
let mut status = SpEreportIngesterStatus::default();
72+
if self.disabled {
73+
status.disabled = true;
74+
slog::trace!(
75+
&opctx.log,
76+
"SP ereport ingestion disabled, doing nothing",
77+
);
78+
return status;
79+
}
7080
// Find MGS clients.
7181
// TODO(eliza): reuse the same client across activations; qorb, etc.
7282
let mgs_clients = {
@@ -478,6 +488,7 @@ mod tests {
478488
datastore.clone(),
479489
nexus.internal_resolver.clone(),
480490
nexus.id(),
491+
false,
481492
);
482493

483494
let activation1 = ingester.actually_activate(&opctx).await;

nexus/types/src/internal_api/background.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,9 @@ pub struct ReadOnlyRegionReplacementStartStatus {
555555

556556
#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)]
557557
pub struct SpEreportIngesterStatus {
558+
/// If `true`, then ereport ingestion has been explicitly disabled by
559+
/// the config file.
560+
pub disabled: bool,
558561
pub sps: Vec<SpEreporterStatus>,
559562
pub errors: Vec<String>,
560563
}

smf/nexus/multi-sled/config-partial.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ read_only_region_replacement_start.period_secs = 30
8383
alert_dispatcher.period_secs = 60
8484
webhook_deliverator.period_secs = 60
8585
sp_ereport_ingester.period_secs = 30
86+
# Disabled in R16, as the Hubris task that handles ereport ingestion requests
87+
# has not merged yet, and trying to ingest them will just result in Nexus
88+
# logging a bunch of errors.
89+
sp_ereport_ingester.disable = true
8690

8791
[default_region_allocation_strategy]
8892
# by default, allocate across 3 distinct sleds

smf/nexus/single-sled/config-partial.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ read_only_region_replacement_start.period_secs = 30
8383
alert_dispatcher.period_secs = 60
8484
webhook_deliverator.period_secs = 60
8585
sp_ereport_ingester.period_secs = 30
86+
# Disabled in R16, as the Hubris task that handles ereport ingestion requests
87+
# has not merged yet, and trying to ingest them will just result in Nexus
88+
# logging a bunch of errors.
89+
sp_ereport_ingester.disable = true
8690

8791
[default_region_allocation_strategy]
8892
# by default, allocate without requirement for distinct sleds.

0 commit comments

Comments
 (0)