Skip to content

Commit 4449f01

Browse files
authored
Nexus: Add an inventory_loader background task (PR 1/2) (#9148)
This adds a background task that periodically attempts to read the latest inventory collection from the DB. It first does a query for just the latest collection ID, then only does the full set of queries to load the entire collection if the ID has changed from the last one it read. Therefore, I set the period pretty aggressively (15 seconds), which is what #5296 suggested. (Neither this PR nor the follow-up does the _rest_ of #5296; i.e., changing how the `inventory_collector` task works.) As of this PR, we only introduce the task and use it in tests. The follow-up PR changes all the other background tasks that currently read collections from the DB to read from this task's watch channel instead.
1 parent 5789479 commit 4449f01

File tree

21 files changed

+498
-60
lines changed

21 files changed

+498
-60
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStatus;
5656
use nexus_types::internal_api::background::EreporterStatus;
5757
use nexus_types::internal_api::background::InstanceReincarnationStatus;
5858
use nexus_types::internal_api::background::InstanceUpdaterStatus;
59+
use nexus_types::internal_api::background::InventoryLoadStatus;
5960
use nexus_types::internal_api::background::LookupRegionPortStatus;
6061
use nexus_types::internal_api::background::ReadOnlyRegionReplacementStartStatus;
6162
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
@@ -1158,6 +1159,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
11581159
"inventory_collection" => {
11591160
print_task_inventory_collection(details);
11601161
}
1162+
"inventory_loader" => {
1163+
print_task_inventory_load(details);
1164+
}
11611165
"lookup_region_port" => {
11621166
print_task_lookup_region_port(details);
11631167
}
@@ -1971,6 +1975,37 @@ fn print_task_inventory_collection(details: &serde_json::Value) {
19711975
};
19721976
}
19731977

1978+
fn print_task_inventory_load(details: &serde_json::Value) {
1979+
match serde_json::from_value::<InventoryLoadStatus>(details.clone()) {
1980+
Err(error) => eprintln!(
1981+
"warning: failed to interpret task details: {:?}: {:?}",
1982+
error, details
1983+
),
1984+
Ok(status) => match status {
1985+
InventoryLoadStatus::Error(error) => {
1986+
println!(" task did not complete successfully: {error}");
1987+
}
1988+
InventoryLoadStatus::NoCollections => {
1989+
println!(" no collections available to load");
1990+
}
1991+
InventoryLoadStatus::Loaded {
1992+
collection_id,
1993+
time_started,
1994+
time_loaded,
1995+
} => {
1996+
println!(
1997+
" loaded latest inventory collection as of {}:",
1998+
humantime::format_rfc3339_millis(time_loaded.into())
1999+
);
2000+
println!(
2001+
" collection {collection_id}, taken at {}",
2002+
humantime::format_rfc3339_millis(time_started.into()),
2003+
);
2004+
}
2005+
},
2006+
};
2007+
}
2008+
19742009
fn print_task_lookup_region_port(details: &serde_json::Value) {
19752010
match serde_json::from_value::<LookupRegionPortStatus>(details.clone()) {
19762011
Ok(LookupRegionPortStatus { found_port_ok, errors }) => {

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ task: "inventory_collection"
116116
collects hardware and software inventory data from the whole system
117117

118118

119+
task: "inventory_loader"
120+
loads the latest inventory collection from the DB
121+
122+
119123
task: "lookup_region_port"
120124
fill in missing ports for region records
121125

@@ -328,6 +332,10 @@ task: "inventory_collection"
328332
collects hardware and software inventory data from the whole system
329333

330334

335+
task: "inventory_loader"
336+
loads the latest inventory collection from the DB
337+
338+
331339
task: "lookup_region_port"
332340
fill in missing ports for region records
333341

@@ -527,6 +535,10 @@ task: "inventory_collection"
527535
collects hardware and software inventory data from the whole system
528536

529537

538+
task: "inventory_loader"
539+
loads the latest inventory collection from the DB
540+
541+
530542
task: "lookup_region_port"
531543
fill in missing ports for region records
532544

dev-tools/omdb/tests/successes.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ task: "inventory_collection"
351351
collects hardware and software inventory data from the whole system
352352

353353

354+
task: "inventory_loader"
355+
loads the latest inventory collection from the DB
356+
357+
354358
task: "lookup_region_port"
355359
fill in missing ports for region records
356360

@@ -649,6 +653,13 @@ task: "inventory_collection"
649653
last collection started: <REDACTED_TIMESTAMP>
650654
last collection done: <REDACTED_TIMESTAMP>
651655

656+
task: "inventory_loader"
657+
configured period: every <REDACTED_DURATION>s
658+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
659+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
660+
loaded latest inventory collection as of <REDACTED_TIMESTAMP>:
661+
collection ..........<REDACTED_UUID>..........., taken at <REDACTED_TIMESTAMP>
662+
652663
task: "lookup_region_port"
653664
configured period: every <REDACTED_DURATION>m
654665
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
@@ -1177,6 +1188,13 @@ task: "inventory_collection"
11771188
last collection started: <REDACTED_TIMESTAMP>
11781189
last collection done: <REDACTED_TIMESTAMP>
11791190

1191+
task: "inventory_loader"
1192+
configured period: every <REDACTED_DURATION>s
1193+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
1194+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1195+
loaded latest inventory collection as of <REDACTED_TIMESTAMP>:
1196+
collection ..........<REDACTED_UUID>..........., taken at <REDACTED_TIMESTAMP>
1197+
11801198
task: "lookup_region_port"
11811199
configured period: every <REDACTED_DURATION>m
11821200
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>

nexus-config/src/nexus_config.rs

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -562,13 +562,25 @@ pub struct SwitchPortSettingsManagerConfig {
562562
#[serde_as]
563563
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
564564
pub struct InventoryConfig {
565-
/// period (in seconds) for periodic activations of this background task
565+
/// period (in seconds) for periodic activations of the background task to
566+
/// load the latest inventory collection
567+
///
568+
/// Each activation runs a fast query to check whether there is a new
569+
/// collection, and only follows up with the set of queries required to load
570+
/// its contents if there's been a change. This period should be pretty
571+
/// aggressive to ensure consumers are usually acting on the latest
572+
/// collection.
573+
#[serde_as(as = "DurationSeconds<u64>")]
574+
pub period_secs_load: Duration,
575+
576+
/// period (in seconds) for periodic activations of the background task to
577+
/// collect inventory
566578
///
567579
/// Each activation fetches information about all hardware and software in
568580
/// the system and inserts it into the database. This generates a moderate
569581
/// amount of data.
570582
#[serde_as(as = "DurationSeconds<u64>")]
571-
pub period_secs: Duration,
583+
pub period_secs_collect: Duration,
572584

573585
/// maximum number of past collections to keep in the database
574586
///
@@ -580,7 +592,7 @@ pub struct InventoryConfig {
580592
///
581593
/// This is an emergency lever for support / operations. It should never be
582594
/// necessary.
583-
pub disable: bool,
595+
pub disable_collect: bool,
584596
}
585597

586598
#[serde_as]
@@ -1109,9 +1121,10 @@ mod test {
11091121
external_endpoints.period_secs = 9
11101122
nat_cleanup.period_secs = 30
11111123
bfd_manager.period_secs = 30
1112-
inventory.period_secs = 10
1113-
inventory.nkeep = 11
1114-
inventory.disable = false
1124+
inventory.period_secs_load = 10
1125+
inventory.period_secs_collect = 11
1126+
inventory.nkeep = 12
1127+
inventory.disable_collect = false
11151128
support_bundle_collector.period_secs = 30
11161129
physical_disk_adoption.period_secs = 30
11171130
decommissioned_disk_cleaner.period_secs = 30
@@ -1274,9 +1287,10 @@ mod test {
12741287
period_secs: Duration::from_secs(30),
12751288
},
12761289
inventory: InventoryConfig {
1277-
period_secs: Duration::from_secs(10),
1278-
nkeep: 11,
1279-
disable: false,
1290+
period_secs_load: Duration::from_secs(10),
1291+
period_secs_collect: Duration::from_secs(11),
1292+
nkeep: 12,
1293+
disable_collect: false,
12801294
},
12811295
support_bundle_collector:
12821296
SupportBundleCollectorConfig {
@@ -1448,9 +1462,10 @@ mod test {
14481462
external_endpoints.period_secs = 9
14491463
nat_cleanup.period_secs = 30
14501464
bfd_manager.period_secs = 30
1451-
inventory.period_secs = 10
1465+
inventory.period_secs_load = 10
1466+
inventory.period_secs_collect = 10
14521467
inventory.nkeep = 3
1453-
inventory.disable = false
1468+
inventory.disable_collect = false
14541469
support_bundle_collector.period_secs = 30
14551470
physical_disk_adoption.period_secs = 30
14561471
decommissioned_disk_cleaner.period_secs = 30

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ pub struct BackgroundTasks {
1717
pub task_nat_cleanup: Activator,
1818
pub task_bfd_manager: Activator,
1919
pub task_inventory_collection: Activator,
20+
pub task_inventory_loader: Activator,
2021
pub task_support_bundle_collector: Activator,
2122
pub task_physical_disk_adoption: Activator,
2223
pub task_decommissioned_disk_cleaner: Activator,

nexus/db-queries/src/db/datastore/inventory.rs

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2329,16 +2329,18 @@ impl DataStore {
23292329
})
23302330
}
23312331

2332-
/// Attempt to read the latest collection.
2332+
/// Attempt to get the ID of the latest collection.
23332333
///
23342334
/// If there aren't any collections, return `Ok(None)`.
2335-
pub async fn inventory_get_latest_collection(
2335+
pub async fn inventory_get_latest_collection_id(
23362336
&self,
23372337
opctx: &OpContext,
2338-
) -> Result<Option<Collection>, Error> {
2338+
) -> Result<Option<CollectionUuid>, Error> {
2339+
use nexus_db_schema::schema::inv_collection::dsl;
2340+
23392341
opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?;
23402342
let conn = self.pool_connection_authorized(opctx).await?;
2341-
use nexus_db_schema::schema::inv_collection::dsl;
2343+
23422344
let collection_id = dsl::inv_collection
23432345
.select(dsl::id)
23442346
.order_by(dsl::time_started.desc())
@@ -2347,17 +2349,23 @@ impl DataStore {
23472349
.optional()
23482350
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
23492351

2350-
let Some(collection_id) = collection_id else {
2352+
Ok(collection_id.map(CollectionUuid::from_untyped_uuid))
2353+
}
2354+
2355+
/// Attempt to read the latest collection.
2356+
///
2357+
/// If there aren't any collections, return `Ok(None)`.
2358+
pub async fn inventory_get_latest_collection(
2359+
&self,
2360+
opctx: &OpContext,
2361+
) -> Result<Option<Collection>, Error> {
2362+
let Some(collection_id) =
2363+
self.inventory_get_latest_collection_id(opctx).await?
2364+
else {
23512365
return Ok(None);
23522366
};
23532367

2354-
Ok(Some(
2355-
self.inventory_collection_read(
2356-
opctx,
2357-
CollectionUuid::from_untyped_uuid(collection_id),
2358-
)
2359-
.await?,
2360-
))
2368+
Ok(Some(self.inventory_collection_read(opctx, collection_id).await?))
23612369
}
23622370

23632371
/// Attempt to read the current collection

nexus/examples/config-second.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,16 @@ metrics_producer_gc.period_secs = 60
114114
external_endpoints.period_secs = 60
115115
nat_cleanup.period_secs = 30
116116
bfd_manager.period_secs = 30
117+
# How frequently to check for a new inventory collection (made by any Nexus).
118+
# This is cheap, so we should check frequently.
119+
inventory.period_secs_load = 15
117120
# How frequently to collect hardware/software inventory from the whole system
118121
# (even if we don't have reason to believe anything has changed).
119-
inventory.period_secs = 600
122+
inventory.period_secs_collect = 600
120123
# Maximum number of past collections to keep in the database
121124
inventory.nkeep = 5
122125
# Disable inventory collection altogether (for emergencies)
123-
inventory.disable = false
126+
inventory.disable_collect = false
124127
phantom_disks.period_secs = 30
125128
physical_disk_adoption.period_secs = 30
126129
support_bundle_collector.period_secs = 30

nexus/examples/config.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,16 @@ metrics_producer_gc.period_secs = 60
9898
external_endpoints.period_secs = 60
9999
nat_cleanup.period_secs = 30
100100
bfd_manager.period_secs = 30
101+
# How frequently to check for a new inventory collection (made by any Nexus).
102+
# This is cheap, so we should check frequently.
103+
inventory.period_secs_load = 15
101104
# How frequently to collect hardware/software inventory from the whole system
102105
# (even if we don't have reason to believe anything has changed).
103-
inventory.period_secs = 600
106+
inventory.period_secs_collect = 600
104107
# Maximum number of past collections to keep in the database
105108
inventory.nkeep = 5
106109
# Disable inventory collection altogether (for emergencies)
107-
inventory.disable = false
110+
inventory.disable_collect = false
108111
phantom_disks.period_secs = 30
109112
physical_disk_adoption.period_secs = 30
110113
support_bundle_collector.period_secs = 30

0 commit comments

Comments
 (0)