oxidecomputer · karencfv · Jul 9, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 13, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml
@@ -21,5 +21,6 @@ serde.workspace = true
 serde_json.workspace = true
 schemars.workspace = true
 slog.workspace = true
+thiserror.workspace = true
 uuid.workspace = true
 omicron-workspace-hack.workspace = true
diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs
@@ -63,7 +63,7 @@ progenitor::generate_api!(
         HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] },
         ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] },
         RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] },
-        RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord] },
+        RotImageError = { derives = [ thiserror::Error, PartialEq, Eq, PartialOrd, Ord] },
         RotState = { derives = [PartialEq, Eq, PartialOrd, Ord] },
         SpComponentCaboose = { derives = [PartialEq, Eq] },
         SpIdentifier = { derives = [Copy, PartialEq, Hash, Eq] },

diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml
@@ -11,6 +11,7 @@ chrono.workspace = true
 futures.workspace = true
 gateway-client.workspace = true
 gateway-types.workspace = true
+gateway-messages.workspace = true
 id-map.workspace = true
 internal-dns-resolver.workspace = true
 internal-dns-types.workspace = true

diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs
@@ -8,6 +8,7 @@
 use super::MgsClients;
 use super::UpdateProgress;
 use futures::future::BoxFuture;
+use gateway_client::types::RotImageError;
 use gateway_client::types::SpType;
 use gateway_client::types::SpUpdateStatus;
 use gateway_types::rot::RotSlot;
@@ -267,14 +268,15 @@ pub trait SpComponentUpdateHelper {
         log: &'a slog::Logger,
         mgs_clients: &'a mut MgsClients,
         update: &'a PendingMgsUpdate,
-    ) -> BoxFuture<'a, Result<(), GatewayClientError>>;
+    ) -> BoxFuture<'a, Result<(), PostUpdateError>>;
 }
 
 /// Describes the live state of the component before the update begins
 #[derive(Debug)]
 pub enum PrecheckStatus {
     UpdateComplete,
     ReadyForUpdate,
+    WaitingForOngoingRotBootloaderUpdate,
 }
 
 #[derive(Debug, Error)]
@@ -319,6 +321,18 @@ pub enum PrecheckError {
     WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion },
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum PostUpdateError {
+    #[error("communicating with MGS")]
+    GatewayClientError(#[from] GatewayClientError),
+
+    #[error("communicating with RoT: {message:?}")]
+    RotCommunicationFailed { message: String },
+
+    #[error("invalid RoT bootloader image: {error:?}")]
+    RotBootloaderImageError { error: RotImageError },
+}
+
 #[derive(Debug)]
 pub enum FoundVersion {
     MissingVersion,

diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs
@@ -4,6 +4,7 @@
 
 //! Concurrent-safe facilities for doing MGS-managed upates
 
+use crate::common_sp_update::PostUpdateError;
 use crate::common_sp_update::PrecheckError;
 use crate::common_sp_update::PrecheckStatus;
 use crate::common_sp_update::STATUS_POLL_INTERVAL;
@@ -32,7 +33,7 @@ use uuid::Uuid;
 
 /// How long may the status remain unchanged without us treating this as a
 /// problem?
-pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(120);
+pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(180);
 
 /// How long to wait between failed attempts to reset the device
 const RESET_DELAY_INTERVAL: Duration = Duration::from_secs(10);
@@ -46,6 +47,14 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60);
 /// How long to wait after resetting the device before expecting it to come up
 const RESET_TIMEOUT: Duration = Duration::from_secs(60);
 
+/// How long to wait for an ongoing RoT bootloader update
+const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration =
+    Duration::from_secs(180);
+
+/// How long to wait between poll attempts on RoT bootloader update status
+const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration =
+    Duration::from_secs(10);
+
 /// Parameters describing a request to update one SP-managed component
 ///
 /// This is similar in spirit to the `SpComponentUpdater` trait but uses a
@@ -216,7 +225,11 @@ pub(crate) async fn apply_update(
     // - if not, then if our required preconditions are met
     status.update(UpdateAttemptStatus::Precheck);
     match update_helper.precheck(log, &mut mgs_clients, update).await {
-        Ok(PrecheckStatus::ReadyForUpdate) => (),
+        Ok(PrecheckStatus::ReadyForUpdate) |
+        // This is the first time a Nexus instance is attempting to
+        // update the RoT bootloader, so we don't need to wait for an
+        // ongoing update.
+        Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => (),
         Ok(PrecheckStatus::UpdateComplete) => {
             return Ok(UpdateCompletedHow::FoundNoChangesNeeded);
         }
@@ -349,16 +362,33 @@ pub(crate) async fn apply_update(
 
     if try_reset {
         // We retry this until we get some error *other* than a communication
-        // error.  There is intentionally no timeout here.  If we've staged an
-        // update but not managed to reset the device, there's no point where
-        // we'd want to stop trying to do so.
+        // error or some other transient error.  There is intentionally no
+        // timeout here.  If we've staged an update but not managed to reset
+        // the device, there's no point where we'd want to stop trying to do so.
         while let Err(error) =
             update_helper.post_update(log, &mut mgs_clients, update).await
         {
-            if !matches!(error, gateway_client::Error::CommunicationError(_)) {
-                let error = InlineErrorChain::new(&error);
-                error!(log, "post_update failed"; &error);
-                return Err(ApplyUpdateError::SpResetFailed(error.to_string()));
+            match error {
+                PostUpdateError::GatewayClientError(error) => {
+                    if !matches!(
+                        error,
+                        gateway_client::Error::CommunicationError(_)
+                    ) {
+                        let error = InlineErrorChain::new(&error);
+                        error!(log, "post_update failed"; &error);
+                        return Err(ApplyUpdateError::SpResetFailed(
+                            error.to_string(),
+                        ));
+                    }
+                }
+                PostUpdateError::RotBootloaderImageError { error } => {
+                    let error = InlineErrorChain::new(&error);
+                    error!(log, "post_update failed"; &error);
+                    return Err(ApplyUpdateError::SpResetFailed(
+                        error.to_string(),
+                    ));
+                }
+                PostUpdateError::RotCommunicationFailed { message: _ } => {}
             }
 
             tokio::time::sleep(RESET_DELAY_INTERVAL).await;
@@ -598,6 +628,24 @@ async fn wait_for_update_done(
             // Check if we're done.
             Ok(PrecheckStatus::UpdateComplete) => return Ok(()),
 
+            // We'll loop for 3 minutes to wait for any ongoing RoT bootloader update.
+            // We need to wait for 2 resets, which have a timeout of 60 seconds each,
+            // and an attempt to retrieve boot info, which has a timeout of 30 seconds.
+            // We give an additional 30 seconds as a buffer for the other actions.
+            Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => {
+                if before.elapsed()
+                    >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT
+                {
+                    return Err(UpdateWaitError::Timeout(
+                        WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT,
+                    ));
+                }
+
+                tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL)
+                    .await;
+                continue;
+            }
+
             // An incorrect version in the "inactive" slot, incorrect active slot,
             // or non-empty pending_persistent_boot_preference/transient_boot_preference
             // are normal during the upgrade. We have no reason to think these won't