Skip to content

Commit e050434

Browse files
authored
[reconfigurator] Fix planner RoT active slot bug (#8893)
Fixes a bug where the planner was using the expected active slot as the found active slot for doing status checks. Closes: #8887
1 parent d196e0c commit e050434

File tree

3 files changed

+99
-53
lines changed

3 files changed

+99
-53
lines changed

nexus/reconfigurator/planning/src/mgs_updates/mod.rs

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -338,45 +338,36 @@ fn mgs_update_status(
338338
expected_pending_persistent_boot_preference,
339339
expected_transient_boot_preference,
340340
}) => {
341-
let active_caboose_which = match &expected_active_slot.slot {
342-
RotSlot::A => CabooseWhich::RotSlotA,
343-
RotSlot::B => CabooseWhich::RotSlotB,
344-
};
345-
346-
let Some(active_caboose) =
347-
inventory.caboose_for(active_caboose_which, baseboard_id)
348-
else {
349-
return Err(MgsUpdateStatusError::MissingActiveCaboose);
350-
};
351-
352-
let found_inactive_version = inventory
353-
.caboose_for(active_caboose_which.toggled_slot(), baseboard_id)
354-
.map(|c| c.caboose.version.as_ref());
355-
356341
let rot_state = inventory
357342
.rots
358343
.get(baseboard_id)
359344
.ok_or(MgsUpdateStatusError::MissingRotState)?;
360345

346+
let active_slot = rot_state.active_slot;
347+
348+
let active_caboose_which = match &active_slot {
349+
RotSlot::A => CabooseWhich::RotSlotA,
350+
RotSlot::B => CabooseWhich::RotSlotB,
351+
};
352+
353+
let active_caboose = inventory
354+
.caboose_for(active_caboose_which, baseboard_id)
355+
.ok_or(MgsUpdateStatusError::MissingActiveCaboose)?;
356+
361357
let found_active_version =
362358
ArtifactVersion::new(active_caboose.caboose.version.clone())
363359
.map_err(|e| {
364360
MgsUpdateStatusError::FailedArtifactVersionParse(e)
365361
})?;
366362

367363
let found_active_slot = ExpectedActiveRotSlot {
368-
slot: rot_state.active_slot,
364+
slot: active_slot,
369365
version: found_active_version,
370366
};
371367

372-
let expected = RotUpdateState {
373-
active_slot: expected_active_slot.clone(),
374-
persistent_boot_preference:
375-
*expected_persistent_boot_preference,
376-
pending_persistent_boot_preference:
377-
*expected_pending_persistent_boot_preference,
378-
transient_boot_preference: *expected_transient_boot_preference,
379-
};
368+
let found_inactive_version = inventory
369+
.caboose_for(active_caboose_which.toggled_slot(), baseboard_id)
370+
.map(|c| c.caboose.version.as_ref());
380371

381372
let found = RotUpdateState {
382373
active_slot: found_active_slot,
@@ -387,6 +378,15 @@ fn mgs_update_status(
387378
transient_boot_preference: rot_state.transient_boot_preference,
388379
};
389380

381+
let expected = RotUpdateState {
382+
active_slot: expected_active_slot.clone(),
383+
persistent_boot_preference:
384+
*expected_persistent_boot_preference,
385+
pending_persistent_boot_preference:
386+
*expected_pending_persistent_boot_preference,
387+
transient_boot_preference: *expected_transient_boot_preference,
388+
};
389+
390390
mgs_update_status_rot(
391391
desired_version,
392392
expected,
@@ -549,6 +549,7 @@ mod test {
549549
use dropshot::ConfigLogging;
550550
use dropshot::ConfigLoggingLevel;
551551
use gateway_client::types::SpType;
552+
use gateway_types::rot::RotSlot;
552553
use nexus_types::deployment::ExpectedVersion;
553554
use nexus_types::deployment::PendingMgsUpdateDetails;
554555
use nexus_types::deployment::PendingMgsUpdateRotBootloaderDetails;
@@ -916,15 +917,22 @@ mod test {
916917
assert_eq!(updates, later_updates);
917918

918919
// At this point, we're ready to test that when the first SpType update
919-
// completes, then the second one *is* started. This tests three
920+
// completes, then the second one *is* started. This tests four
920921
// different things: first that we noticed the first one completed,
921-
// second that we noticed another thing needed an update, and third that
922-
// the planner schedules the updates in the correct order: first RoT,
923-
// and second SP.
922+
// second that we noticed another thing needed an update, third that
923+
// an update is correctly configured after the active slot has changed
924+
// from the previous component update, and fourth that the planner
925+
// schedules the updates in the correct order: first RoT, and second SP.
924926
let later_collection = test_boards
925927
.collection_builder()
926928
.sp_active_version_exception(SpType::Switch, 1, ARTIFACT_VERSION_1)
927929
.rot_active_version_exception(SpType::Switch, 1, ARTIFACT_VERSION_1)
930+
.rot_active_slot_exception(SpType::Sled, 0, RotSlot::B)
931+
.rot_persistent_boot_preference_exception(
932+
SpType::Sled,
933+
0,
934+
RotSlot::B,
935+
)
928936
.build();
929937
let PlannedMgsUpdates { pending_updates: later_updates, .. } =
930938
plan_mgs_updates(

nexus/reconfigurator/planning/src/mgs_updates/rot.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ pub fn mgs_update_status_rot(
8282
// should not proceed. Once https://github.com/oxidecomputer/hubris/pull/2050
8383
// is implemented, we should revist this check
8484
if found_persistent_boot_preference != found_active_slot.slot
85+
|| expected_persistent_boot_preference != expected_active_slot.slot
8586
|| found_transient_boot_preference.is_some()
8687
|| expected_transient_boot_preference.is_some()
8788
{

nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs

Lines changed: 62 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,8 @@ pub(super) struct TestBoardCollectionBuilder<'a> {
744744
sp_active_version_exceptions: BTreeMap<SpIdentifier, ArtifactVersion>,
745745
rot_active_version_exceptions: BTreeMap<SpIdentifier, ArtifactVersion>,
746746
stage0_version_exceptions: BTreeMap<SpIdentifier, ArtifactVersion>,
747+
rot_active_slot_exceptions: BTreeMap<SpIdentifier, RotSlot>,
748+
rot_persistent_boot_preference_exceptions: BTreeMap<SpIdentifier, RotSlot>,
747749

748750
// host exceptions are keyed only by slot; they only apply to sleds.
749751
host_exceptions: BTreeMap<u16, HostOsException>,
@@ -774,6 +776,8 @@ impl<'a> TestBoardCollectionBuilder<'a> {
774776
sp_active_version_exceptions: BTreeMap::new(),
775777
rot_active_version_exceptions: BTreeMap::new(),
776778
stage0_version_exceptions: BTreeMap::new(),
779+
rot_active_slot_exceptions: BTreeMap::new(),
780+
rot_persistent_boot_preference_exceptions: BTreeMap::new(),
777781
host_exceptions: BTreeMap::new(),
778782
}
779783
}
@@ -829,6 +833,28 @@ impl<'a> TestBoardCollectionBuilder<'a> {
829833
self
830834
}
831835

836+
pub fn rot_active_slot_exception(
837+
mut self,
838+
type_: SpType,
839+
sp_slot: u16,
840+
s: RotSlot,
841+
) -> Self {
842+
self.rot_active_slot_exceptions
843+
.insert(SpIdentifier { type_, slot: sp_slot }, s);
844+
self
845+
}
846+
847+
pub fn rot_persistent_boot_preference_exception(
848+
mut self,
849+
type_: SpType,
850+
sp_slot: u16,
851+
s: RotSlot,
852+
) -> Self {
853+
self.rot_persistent_boot_preference_exceptions
854+
.insert(SpIdentifier { type_, slot: sp_slot }, s);
855+
self
856+
}
857+
832858
pub fn has_rot_active_version_exception(
833859
&self,
834860
type_: SpType,
@@ -906,29 +932,6 @@ impl<'a> TestBoardCollectionBuilder<'a> {
906932
let mut builder =
907933
nexus_inventory::CollectionBuilder::new(self.boards.test_name);
908934

909-
let dummy_sp_state = SpState {
910-
base_mac_address: [0; 6],
911-
hubris_archive_id: String::from("unused"),
912-
model: String::from("unused"),
913-
power_state: PowerState::A0,
914-
revision: 0,
915-
rot: RotState::V3 {
916-
active: RotSlot::A,
917-
pending_persistent_boot_preference: None,
918-
persistent_boot_preference: RotSlot::A,
919-
slot_a_error: None,
920-
slot_a_fwid: Default::default(),
921-
slot_b_error: None,
922-
slot_b_fwid: Default::default(),
923-
stage0_error: None,
924-
stage0_fwid: Default::default(),
925-
stage0next_error: None,
926-
stage0next_fwid: Default::default(),
927-
transient_boot_preference: None,
928-
},
929-
serial_number: String::from("unused"),
930-
};
931-
932935
for board in &self.boards.boards {
933936
let &TestBoard {
934937
id: sp_id,
@@ -939,6 +942,40 @@ impl<'a> TestBoardCollectionBuilder<'a> {
939942
rot_sign: rkth,
940943
} = board;
941944

945+
let rot_active_slot = self
946+
.rot_active_slot_exceptions
947+
.get(&sp_id)
948+
.cloned()
949+
.unwrap_or(RotSlot::A);
950+
let rot_persistent_boot_preference = self
951+
.rot_persistent_boot_preference_exceptions
952+
.get(&sp_id)
953+
.cloned()
954+
.unwrap_or(RotSlot::A);
955+
956+
let dummy_sp_state = SpState {
957+
base_mac_address: [0; 6],
958+
hubris_archive_id: String::from("unused"),
959+
model: String::from("unused"),
960+
power_state: PowerState::A0,
961+
revision: 0,
962+
rot: RotState::V3 {
963+
active: rot_active_slot,
964+
pending_persistent_boot_preference: None,
965+
persistent_boot_preference: rot_persistent_boot_preference,
966+
slot_a_error: None,
967+
slot_a_fwid: Default::default(),
968+
slot_b_error: None,
969+
slot_b_fwid: Default::default(),
970+
stage0_error: None,
971+
stage0_fwid: Default::default(),
972+
stage0next_error: None,
973+
stage0next_fwid: Default::default(),
974+
transient_boot_preference: None,
975+
},
976+
serial_number: String::from("unused"),
977+
};
978+
942979
let sp_state = SpState {
943980
model: format!("dummy_{}", sp_id.type_),
944981
serial_number: serial.to_string(),
@@ -985,7 +1022,7 @@ impl<'a> TestBoardCollectionBuilder<'a> {
9851022
builder
9861023
.found_caboose(
9871024
&baseboard_id,
988-
CabooseWhich::RotSlotA,
1025+
CabooseWhich::from_rot_slot(rot_active_slot),
9891026
"test",
9901027
SpComponentCaboose {
9911028
board: caboose_rot_board.to_string(),
@@ -1040,7 +1077,7 @@ impl<'a> TestBoardCollectionBuilder<'a> {
10401077
builder
10411078
.found_caboose(
10421079
&baseboard_id,
1043-
CabooseWhich::RotSlotB,
1080+
CabooseWhich::from_rot_slot(rot_active_slot.toggled()),
10441081
"test",
10451082
SpComponentCaboose {
10461083
board: caboose_rot_board.to_string(),

0 commit comments

Comments
 (0)