Skip to content

Commit f1f94f4

Browse files
authored
Add measurements to sled-agent (#9626)
1 parent 310c086 commit f1f94f4

File tree

22 files changed

+598
-117
lines changed

22 files changed

+598
-117
lines changed

Cargo.lock

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ members = [
136136
"sled-agent/bootstrap-agent-api",
137137
"sled-agent/config-reconciler",
138138
"sled-agent/health-monitor",
139+
"sled-agent/measurements",
139140
"sled-agent/repo-depot-api",
140141
"sled-agent/types",
141142
"sled-agent/types/versions",
@@ -308,6 +309,7 @@ default-members = [
308309
"sled-agent/bootstrap-agent-api",
309310
"sled-agent/config-reconciler",
310311
"sled-agent/health-monitor",
312+
"sled-agent/measurements",
311313
"sled-agent/repo-depot-api",
312314
"sled-agent/types",
313315
"sled-agent/types/versions",
@@ -745,6 +747,7 @@ sled-agent-api = { path = "sled-agent/api" }
745747
sled-agent-client = { path = "clients/sled-agent-client" }
746748
sled-agent-config-reconciler = { path = "sled-agent/config-reconciler" }
747749
sled-agent-health-monitor = { path = "sled-agent/health-monitor" }
750+
sled-agent-measurements = { path = "sled-agent/measurements" }
748751
sled-agent-types = { path = "sled-agent/types" }
749752
sled-agent-types-versions = { path = "sled-agent/types/versions" }
750753
sled-agent-resolvable-files = { path = "sled-agent/resolvable-files" }

sled-agent/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ sled-agent-api.workspace = true
8989
sled-agent-client.workspace = true
9090
sled-agent-config-reconciler.workspace = true
9191
sled-agent-health-monitor.workspace = true
92+
sled-agent-measurements.workspace = true
9293
sled-agent-types.workspace = true
9394
sled-agent-types-versions.workspace = true
9495
sled-agent-resolvable-files.workspace = true

sled-agent/config-reconciler/src/handle.rs

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use slog::Logger;
2222
use std::collections::HashSet;
2323
use std::sync::Arc;
2424
use std::sync::OnceLock;
25+
use tokio::sync::oneshot;
2526
use tokio::sync::watch;
2627

2728
#[cfg(feature = "testing")]
@@ -82,18 +83,29 @@ pub enum TimeSyncConfig {
8283
}
8384

8485
#[derive(Debug)]
85-
pub struct ConfigReconcilerSpawnToken {
86+
struct SpawnTokenCommon {
8687
key_requester: StorageKeyRequester,
8788
time_sync_config: TimeSyncConfig,
8889
reconciler_result_tx: watch::Sender<ReconcilerResult>,
8990
currently_managed_zpools_tx: watch::Sender<Arc<CurrentlyManagedZpools>>,
9091
external_disks_tx: watch::Sender<HashSet<Disk>>,
9192
former_zone_root_archiver: FormerZoneRootArchiver,
9293
raw_disks_rx: RawDisksReceiver,
93-
ledger_task_log: Logger,
9494
reconciler_task_log: Logger,
9595
}
9696

97+
#[derive(Debug)]
98+
pub struct LedgerTaskSpawnToken {
99+
common: SpawnTokenCommon,
100+
ledger_task_log: Logger,
101+
}
102+
103+
#[derive(Debug)]
104+
pub struct ConfigReconcilerSpawnToken {
105+
common: SpawnTokenCommon,
106+
ledger_rx: watch::Receiver<CurrentSledConfig>,
107+
}
108+
97109
#[derive(Debug)]
98110
pub struct ConfigReconcilerHandle {
99111
raw_disks_tx: RawDisksSender,
@@ -110,18 +122,25 @@ impl ConfigReconcilerHandle {
110122
/// Create a `ConfigReconcilerHandle` and spawn many of the early-sled-agent
111123
/// background tasks (e.g., managing internal disks).
112124
///
113-
/// The config reconciler subsystem splits initialization into two phases:
114-
/// the main reconcilation task will not be spawned until
115-
/// `spawn_reconciliation_task()` is called on the returned handle.
125+
/// The config reconciler subsystem splits initialization into three phases:
126+
/// - This function returns a `LedgerTaskSpawnToken`
127+
/// - `spawn_ledger_task` takes a `LedgerTaskSpawnToken` and is responsible
128+
/// for spawning the ledger task and returning a `ConfigReconcilerSpawnToken`.
129+
/// The ledger task needs to be spawned early for access to reference measurements.
130+
/// - The main reconciliation task will not be spawned until
131+
/// `spawn_reconciliation_task()` is called with a `ConfigReconcilerSpawnToken`.
116132
/// `spawn_reconciliation_task()` cannot be called by sled-agent proper
117133
/// until rack setup has occurred (or sled-agent has found its config from a
118134
/// prior rack setup, during a cold boot).
135+
///
136+
/// This is designed to make it difficult (if not impossible!) to call
137+
/// these functions in the wrong order.
119138
pub fn new(
120139
mount_config: MountConfig,
121140
key_requester: StorageKeyRequester,
122141
time_sync_config: TimeSyncConfig,
123142
base_log: &Logger,
124-
) -> (Self, ConfigReconcilerSpawnToken) {
143+
) -> (Self, LedgerTaskSpawnToken) {
125144
let mount_config = Arc::new(mount_config);
126145

127146
// Spawn the task that monitors our internal disks (M.2s).
@@ -168,54 +187,49 @@ impl ConfigReconcilerHandle {
168187
// Stash the dependencies the reconciler task will need in
169188
// `spawn_reconciliation_task()` inside this token that the caller
170189
// has to hold until it has the other outside dependencies ready.
171-
ConfigReconcilerSpawnToken {
172-
key_requester,
173-
time_sync_config,
174-
reconciler_result_tx,
175-
currently_managed_zpools_tx,
176-
external_disks_tx,
177-
former_zone_root_archiver,
178-
raw_disks_rx,
190+
LedgerTaskSpawnToken {
191+
common: SpawnTokenCommon {
192+
key_requester,
193+
time_sync_config,
194+
reconciler_result_tx,
195+
currently_managed_zpools_tx,
196+
external_disks_tx,
197+
former_zone_root_archiver,
198+
raw_disks_rx,
199+
reconciler_task_log: base_log
200+
.new(slog::o!("component" => "ConfigReconcilerTask")),
201+
},
179202
ledger_task_log: base_log
180203
.new(slog::o!("component" => "SledConfigLedgerTask")),
181-
reconciler_task_log: base_log
182-
.new(slog::o!("component" => "ConfigReconcilerTask")),
183204
},
184205
)
185206
}
186207

187-
/// Spawn the primary config reconciliation task.
208+
/// Spawn the ledger task.
209+
///
210+
/// This is the first half of spawning the reconciliation task. We need to
211+
/// spawn the ledger task early to allow for access to the ledger for
212+
/// early measurement reconciliation.
188213
///
189214
/// This method can effectively only be called once, because the caller must
190215
/// supply the `token` returned by `new()` when this handle was created.
191216
///
217+
/// This also returns a oneshot receiver for use by the measurement handler to
218+
/// know when the ledger task has started running and has found M.2 disks.
219+
///
192220
/// # Panics
193221
///
194222
/// Panics if called multiple times, which is statically impossible outside
195-
/// shenanigans to get a second [`ConfigReconcilerSpawnToken`].
196-
pub fn spawn_reconciliation_task<
197-
T: SledAgentFacilities,
198-
U: SledAgentArtifactStore + Clone,
199-
>(
223+
/// shenanigans to get a second `LedgerTaskSpawnToken`.
224+
pub fn spawn_ledger_task<U: SledAgentArtifactStore + Clone>(
200225
&self,
201-
sled_agent_facilities: T,
202226
sled_agent_artifact_store: U,
203-
token: ConfigReconcilerSpawnToken,
204-
) {
205-
let ConfigReconcilerSpawnToken {
206-
key_requester,
207-
time_sync_config,
208-
reconciler_result_tx,
209-
currently_managed_zpools_tx,
210-
external_disks_tx,
211-
former_zone_root_archiver,
212-
raw_disks_rx,
213-
ledger_task_log,
214-
reconciler_task_log,
215-
} = token;
227+
token: LedgerTaskSpawnToken,
228+
) -> (ConfigReconcilerSpawnToken, oneshot::Receiver<()>) {
229+
let LedgerTaskSpawnToken { common, ledger_task_log } = token;
216230

217231
// Spawn the task that manages our config ledger.
218-
let (ledger_task, current_config_rx) =
232+
let (ledger_task, current_config_rx, ledger_task_run_rx) =
219233
LedgerTaskHandle::spawn_ledger_task(
220234
self.internal_disks_rx.clone(),
221235
sled_agent_artifact_store.clone(),
@@ -227,18 +241,51 @@ impl ConfigReconcilerHandle {
227241
// we document that we panic if called multiple times via some
228242
// multi-token shenanigans.
229243
Err(_) => {
230-
panic!(
231-
"spawn_reconciliation_task() called with multiple tokens"
232-
)
244+
panic!("spawn_ledger_task() called with multiple tokens")
233245
}
234246
}
235247

248+
(
249+
ConfigReconcilerSpawnToken { common, ledger_rx: current_config_rx },
250+
ledger_task_run_rx,
251+
)
252+
}
253+
254+
/// Spawn the primary config reconciliation task.
255+
///
256+
/// This method can effectively only be called once, because the caller must
257+
/// supply the `token` returned by `spawn_ledger_task()`.
258+
///
259+
pub fn spawn_reconciliation_task<
260+
T: SledAgentFacilities,
261+
U: SledAgentArtifactStore + Clone,
262+
>(
263+
&self,
264+
sled_agent_facilities: T,
265+
sled_agent_artifact_store: U,
266+
token: ConfigReconcilerSpawnToken,
267+
) {
268+
let ConfigReconcilerSpawnToken {
269+
common:
270+
SpawnTokenCommon {
271+
key_requester,
272+
time_sync_config,
273+
reconciler_result_tx,
274+
currently_managed_zpools_tx,
275+
external_disks_tx,
276+
former_zone_root_archiver,
277+
raw_disks_rx,
278+
reconciler_task_log,
279+
},
280+
ledger_rx,
281+
} = token;
282+
236283
reconciler_task::spawn(
237284
Arc::clone(self.internal_disks_rx.mount_config()),
238285
self.dataset_task.clone(),
239286
key_requester,
240287
time_sync_config,
241-
current_config_rx,
288+
ledger_rx,
242289
reconciler_result_tx,
243290
currently_managed_zpools_tx,
244291
self.internal_disks_rx.clone(),

sled-agent/config-reconciler/src/ledger.rs

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ impl LedgerTaskHandle {
127127
internal_disks_rx: InternalDisksReceiver,
128128
artifact_store: T,
129129
log: Logger,
130-
) -> (Self, watch::Receiver<CurrentSledConfig>) {
130+
) -> (Self, watch::Receiver<CurrentSledConfig>, oneshot::Receiver<()>) {
131131
// We only accept two kinds of requests on this channel, both of which
132132
// come from HTTP requests to sled agent:
133133
//
@@ -152,6 +152,11 @@ impl LedgerTaskHandle {
152152
let (current_config_tx, current_config_rx) =
153153
watch::channel(CurrentSledConfig::WaitingForInternalDisks);
154154

155+
// The measurement handler relies on the ledger task running.
156+
// Give a channel to wait for that to happen instead of relying
157+
// on polling.
158+
let (ledger_run_tx, ledger_run_rx) = oneshot::channel();
159+
155160
tokio::spawn(
156161
LedgerTask {
157162
artifact_store,
@@ -160,12 +165,13 @@ impl LedgerTaskHandle {
160165
current_config_tx,
161166
log,
162167
}
163-
.run(),
168+
.run(ledger_run_tx),
164169
);
165170

166171
(
167172
Self { request_tx, current_config_rx: current_config_rx.clone() },
168173
current_config_rx,
174+
ledger_run_rx,
169175
)
170176
}
171177

@@ -255,10 +261,10 @@ struct LedgerTask<T> {
255261
}
256262

257263
impl<T: SledAgentArtifactStore> LedgerTask<T> {
258-
async fn run(self) {
264+
async fn run(self, ledger_run_tx: oneshot::Sender<()>) {
259265
// This pattern match looks strange, but `run_impl()` cannot return
260266
// `Ok(_)`; it must run forever (or until failure).
261-
let Err((log, err)) = self.run_impl().await;
267+
let Err((log, err)) = self.run_impl(ledger_run_tx).await;
262268
error!(
263269
log,
264270
"LedgerTask::run() unexpectedly exited; this should only be \
@@ -269,6 +275,7 @@ impl<T: SledAgentArtifactStore> LedgerTask<T> {
269275

270276
async fn run_impl(
271277
mut self,
278+
ledger_run_tx: oneshot::Sender<()>,
272279
) -> Result<Infallible, (Logger, LedgerTaskExit)> {
273280
// We created `self.current_config_tx` in `spawn_ledger_task()` and own
274281
// the only sender, so it should start out in the `WaitingForInternalDisks`
@@ -283,6 +290,9 @@ impl<T: SledAgentArtifactStore> LedgerTask<T> {
283290
CurrentSledConfig::WaitingForInternalDisks
284291
);
285292

293+
// We've gotten far enough that our disks should be ready!
294+
let _ = ledger_run_tx.send(());
295+
286296
loop {
287297
let Some(request) = self.request_rx.recv().await else {
288298
return Err((
@@ -800,13 +810,16 @@ mod tests {
800810
}
801811

802812
// Spawn the ledger task.
803-
let (task_handle, mut current_config_rx) =
813+
let (task_handle, mut current_config_rx, ledger_task_run_rx) =
804814
LedgerTaskHandle::spawn_ledger_task(
805815
internal_disks_rx.clone(),
806816
fake_artifact_store,
807817
log,
808818
);
809819

820+
// This better run!
821+
let _ = ledger_task_run_rx.await.unwrap();
822+
810823
// Wait for the task to check our fake disk and progress to either
811824
// `Ledgered` (if we copied in a config) or
812825
// `WaitingForInitialConfig` (if we didn't).
@@ -896,7 +909,7 @@ mod tests {
896909

897910
// Spawn the ledger task. It should sit in the `WaitingForInternalDisks`
898911
// state.
899-
let (_task_handle, mut current_config_rx) =
912+
let (_task_handle, mut current_config_rx, ledger_task_run_rx) =
900913
LedgerTaskHandle::spawn_ledger_task(
901914
internal_disks_rx,
902915
FakeArtifactStore::default(),
@@ -911,6 +924,8 @@ mod tests {
911924
// Populate a fake disk.
912925
disks_tx.send(vec![make_fake_disk()]).expect("receiver still exists");
913926

927+
let _ = ledger_task_run_rx.await.unwrap();
928+
914929
// Confirm the ledger task notices and progresses to
915930
// `WaitingForInitialConfig`.
916931
wait_for_watch_channel_condition(

sled-agent/config-reconciler/src/reconciler_task.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ impl LatestReconciliationResult {
236236
zones: self.zones_inventory.clone(),
237237
boot_partitions: self.boot_partitions.clone(),
238238
remove_mupdate_override: self.remove_mupdate_override.clone(),
239-
// TODO: this will come in another PR
240239
measurements: IdOrdMap::new(),
241240
}
242241
}

0 commit comments

Comments
 (0)