Skip to content

Commit 347bf8d

Browse files
authored
[mgs-updates] Ensure boot info is available after an RoT reset (#8905)
By waiting for boot info to be available, a `sprot: timeout` error will be near impossible.
1 parent 58ee63b commit 347bf8d

File tree

3 files changed

+158
-92
lines changed

3 files changed

+158
-92
lines changed

nexus/mgs-updates/src/driver_update.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -667,13 +667,14 @@ fn post_update_timeout(update: &PendingMgsUpdate) -> Duration {
667667
}
668668
}
669669
PendingMgsUpdateDetails::Rot { .. } => {
670-
// Resetting the RoT should be quick (a few seconds).
671-
Duration::from_secs(60)
670+
// Resetting the RoT should be quick (a few seconds), but we wait
671+
// for boot info after the reset.
672+
Duration::from_secs(90)
672673
}
673674
PendingMgsUpdateDetails::RotBootloader { .. } => {
674675
// Resetting the bootloader requires multiple RoT resets; give this
675676
// a longer timeout.
676-
Duration::from_secs(180)
677+
Duration::from_secs(210)
677678
}
678679
PendingMgsUpdateDetails::HostPhase1(..) => {
679680
// Resetting a sled takes several minutes (mostly DRAM training);

nexus/mgs-updates/src/rot_bootloader_updater.rs

Lines changed: 47 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,21 @@ use crate::common_sp_update::PrecheckError;
1212
use crate::common_sp_update::PrecheckStatus;
1313
use crate::common_sp_update::error_means_caboose_is_invalid;
1414
use crate::mgs_clients::GatewayClientError;
15+
use crate::rot_updater::WAIT_FOR_BOOT_INFO_TIMEOUT;
16+
use crate::rot_updater::wait_for_boot_info;
1517
use futures::FutureExt;
1618
use futures::future::BoxFuture;
1719
use gateway_client::SpComponent;
18-
use gateway_client::types::GetRotBootInfoParams;
1920
use gateway_client::types::RotImageError;
2021
use gateway_client::types::RotState;
2122
use gateway_client::types::SpComponentFirmwareSlot;
2223
use gateway_client::types::SpType;
23-
use gateway_messages::RotBootInfo;
2424
use nexus_types::deployment::PendingMgsUpdate;
2525
use nexus_types::deployment::PendingMgsUpdateRotBootloaderDetails;
2626
use slog::Logger;
27-
use slog::{debug, error, info};
27+
use slog::{debug, error};
2828
use slog_error_chain::InlineErrorChain;
2929
use std::time::Duration;
30-
use std::time::Instant;
31-
32-
const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(120);
33-
34-
const WAIT_FOR_BOOT_INFO_INTERVAL: Duration = Duration::from_secs(10);
3530

3631
pub struct ReconfiguratorRotBootloaderUpdater {
3732
details: PendingMgsUpdateRotBootloaderDetails,
@@ -193,10 +188,6 @@ impl SpComponentUpdateHelperImpl for ReconfiguratorRotBootloaderUpdater {
193188

194189
// We now retrieve boot info from the RoT to verify the reset
195190
// has completed and signature checks done.
196-
debug!(
197-
log,
198-
"attempting to retrieve boot info to verify image validity"
199-
);
200191
let stage0next_error = wait_for_stage0_next_image_check(
201192
log,
202193
mgs_clients,
@@ -252,7 +243,10 @@ impl SpComponentUpdateHelperImpl for ReconfiguratorRotBootloaderUpdater {
252243
})
253244
.await?;
254245

255-
debug!(log, "attempting to reset device to set to new RoT bootloader version");
246+
debug!(
247+
log,
248+
"attempting to reset the device to set a new RoT bootloader version",
249+
);
256250
mgs_clients
257251
.try_all_serially(log, move |mgs_client| async move {
258252
mgs_client
@@ -265,96 +259,62 @@ impl SpComponentUpdateHelperImpl for ReconfiguratorRotBootloaderUpdater {
265259
})
266260
.await?;
267261

262+
// We wait for boot info to ensure a successful reset
263+
wait_for_boot_info(
264+
log,
265+
mgs_clients,
266+
update.sp_type,
267+
update.slot_id,
268+
WAIT_FOR_BOOT_INFO_TIMEOUT,
269+
)
270+
.await?;
268271
Ok(())
269272
}
270273
.boxed()
271274
}
272275
}
273276

274277
/// Poll the RoT asking for its boot information. This is used to check
275-
/// state after RoT bootloader updates
278+
/// the state for RoT bootloader image errors after the RoT is reset
276279
async fn wait_for_stage0_next_image_check(
277280
log: &Logger,
278281
mgs_clients: &mut MgsClients,
279282
sp_type: SpType,
280283
sp_slot: u16,
281284
timeout: Duration,
282285
) -> Result<Option<RotImageError>, PostUpdateError> {
283-
let before = Instant::now();
284-
loop {
285-
match mgs_clients
286-
.try_all_serially(log, |mgs_client| async move {
287-
mgs_client
288-
.sp_rot_boot_info(
289-
sp_type,
290-
sp_slot,
291-
SpComponent::ROT.const_as_str(),
292-
&GetRotBootInfoParams {
293-
version: RotBootInfo::HIGHEST_KNOWN_VERSION,
294-
},
295-
)
296-
.await
297-
})
298-
.await
299-
{
300-
Ok(state) => match state.into_inner() {
301-
// The minimum we will ever return is v3.
302-
// Additionally, V2 does not report image errors, so we cannot
303-
// know with certainty if a signature check came back with errors
304-
RotState::V2 { .. } => {
305-
let error = "unexpected RoT version: 2".to_string();
306-
error!(
307-
log,
308-
"failed to get RoT boot info";
309-
"error" => &error
310-
);
311-
return Err(PostUpdateError::FatalError { error });
312-
}
313-
RotState::V3 { stage0next_error, .. } => {
314-
return Ok(stage0next_error);
315-
}
316-
// The RoT is probably still booting
317-
RotState::CommunicationFailed { message } => {
318-
if before.elapsed() >= timeout {
319-
error!(
320-
log,
321-
"failed to get RoT boot info";
322-
"error" => %message
323-
);
324-
return Err(PostUpdateError::FatalError {
325-
error: message,
326-
});
327-
}
328-
329-
info!(
330-
log,
331-
"failed getting RoT boot info (will retry)";
332-
"error" => %message,
333-
);
334-
tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await;
335-
}
336-
},
337-
// The RoT might still be booting
338-
Err(error) => {
339-
let e = InlineErrorChain::new(&error);
340-
if before.elapsed() >= timeout {
341-
error!(
342-
log,
343-
"failed to get RoT boot info";
344-
&e,
345-
);
346-
return Err(PostUpdateError::FatalError {
347-
error: e.to_string(),
348-
});
349-
}
350-
351-
info!(
286+
debug!(log, "attempting to verify image validity");
287+
match wait_for_boot_info(log, mgs_clients, sp_type, sp_slot, timeout).await
288+
{
289+
Ok(state) => match state {
290+
// The minimum we will ever return is v3.
291+
// Additionally, V2 does not report image errors, so we cannot
292+
// know with certainty if a signature check came back with errors
293+
RotState::V2 { .. } => {
294+
let error = "unexpected RoT version: 2".to_string();
295+
error!(
352296
log,
353-
"failed getting RoT boot info (will retry)";
354-
e,
297+
"failed to get RoT boot info";
298+
"error" => &error
355299
);
356-
tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await;
300+
return Err(PostUpdateError::FatalError { error });
357301
}
358-
}
302+
RotState::V3 { stage0next_error, .. } => {
303+
debug!(log, "successfully completed an image signature check");
304+
return Ok(stage0next_error);
305+
}
306+
// This is unreachable because wait_for_boot_info loops for some
307+
// time if it encounters `CommunicationFailed`, and if it hits the
308+
// timeout, it will return an error.
309+
RotState::CommunicationFailed { message } => {
310+
error!(
311+
log,
312+
"failed to get RoT boot info";
313+
"error" => %message
314+
);
315+
return Err(PostUpdateError::FatalError { error: message });
316+
}
317+
},
318+
Err(error) => return Err(error),
359319
}
360320
}

nexus/mgs-updates/src/rot_updater.rs

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,22 @@ use crate::common_sp_update::error_means_caboose_is_invalid;
1414
use futures::FutureExt;
1515
use futures::future::BoxFuture;
1616
use gateway_client::SpComponent;
17+
use gateway_client::types::GetRotBootInfoParams;
1718
use gateway_client::types::RotState;
1819
use gateway_client::types::SpComponentFirmwareSlot;
20+
use gateway_client::types::SpType;
21+
use gateway_messages::RotBootInfo;
1922
use nexus_types::deployment::PendingMgsUpdate;
2023
use nexus_types::deployment::PendingMgsUpdateRotDetails;
21-
use slog::{debug, info};
24+
use slog::Logger;
25+
use slog::{debug, error, info};
26+
use slog_error_chain::InlineErrorChain;
27+
use std::time::Duration;
28+
use std::time::Instant;
29+
30+
pub const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(120);
31+
32+
const WAIT_FOR_BOOT_INFO_INTERVAL: Duration = Duration::from_secs(10);
2233

2334
type GatewayClientError = gateway_client::Error<gateway_client::types::Error>;
2435

@@ -223,7 +234,10 @@ impl SpComponentUpdateHelperImpl for ReconfiguratorRotUpdater {
223234
})
224235
.await?;
225236

226-
debug!(log, "attempting to reset device");
237+
debug!(
238+
log,
239+
"attempting to reset the device to set a new RoT version"
240+
);
227241
mgs_clients
228242
.try_all_serially(log, move |mgs_client| async move {
229243
mgs_client
@@ -235,8 +249,99 @@ impl SpComponentUpdateHelperImpl for ReconfiguratorRotUpdater {
235249
.await
236250
})
237251
.await?;
252+
253+
// We wait for boot info to ensure a successful reset
254+
wait_for_boot_info(
255+
log,
256+
mgs_clients,
257+
update.sp_type,
258+
update.slot_id,
259+
WAIT_FOR_BOOT_INFO_TIMEOUT,
260+
)
261+
.await?;
238262
Ok(())
239263
}
240264
.boxed()
241265
}
242266
}
267+
268+
/// Poll the RoT asking for its boot information. This confirms that the RoT has
269+
/// been successfully reset
270+
pub async fn wait_for_boot_info(
271+
log: &Logger,
272+
mgs_clients: &mut MgsClients,
273+
sp_type: SpType,
274+
sp_slot: u16,
275+
timeout: Duration,
276+
) -> Result<RotState, PostUpdateError> {
277+
let before = Instant::now();
278+
loop {
279+
debug!(log, "waiting for boot info to confirm a successful reset");
280+
match mgs_clients
281+
.try_all_serially(log, |mgs_client| async move {
282+
mgs_client
283+
.sp_rot_boot_info(
284+
sp_type,
285+
sp_slot,
286+
SpComponent::ROT.const_as_str(),
287+
&GetRotBootInfoParams {
288+
version: RotBootInfo::HIGHEST_KNOWN_VERSION,
289+
},
290+
)
291+
.await
292+
})
293+
.await
294+
{
295+
Ok(state) => match state.clone() {
296+
// Either version means the RoT answered, which is all this function checks.
297+
// Note that V2 does not report image errors, so callers that need the
298+
// result of a signature check must reject V2 themselves.
299+
RotState::V2 { .. } | RotState::V3 { .. } => {
300+
debug!(log, "successfuly retrieved boot info");
301+
return Ok(state.into_inner());
302+
}
303+
// The RoT is probably still booting
304+
RotState::CommunicationFailed { message } => {
305+
if before.elapsed() >= timeout {
306+
error!(
307+
log,
308+
"failed to get RoT boot info";
309+
"error" => %message
310+
);
311+
return Err(PostUpdateError::FatalError {
312+
error: message,
313+
});
314+
}
315+
316+
info!(
317+
log,
318+
"failed getting RoT boot info (will retry)";
319+
"error" => %message,
320+
);
321+
tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await;
322+
}
323+
},
324+
// The RoT might still be booting
325+
Err(error) => {
326+
let e = InlineErrorChain::new(&error);
327+
if before.elapsed() >= timeout {
328+
error!(
329+
log,
330+
"failed to get RoT boot info";
331+
&e,
332+
);
333+
return Err(PostUpdateError::FatalError {
334+
error: e.to_string(),
335+
});
336+
}
337+
338+
info!(
339+
log,
340+
"failed getting RoT boot info (will retry)";
341+
e,
342+
);
343+
tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await;
344+
}
345+
}
346+
}
347+
}

0 commit comments

Comments
 (0)