Skip to content

Commit 064995c

Browse files
authored
[sp-sim] Enhancements to support host OS testing (#8731)
This is pulled out of the upcoming Reconfigurator host OS update work to make that PR slightly smaller. It contains a few small enhancements to our SP simulator:

* Simulated sleds now always report something for both phase 1 slots. The initial values for the two slots are different. (This isn't strictly necessary, but it makes debugging some tests easier: without it, both slots report the same hash value for their contents.)
* We expose a count of the number of times we've performed a power state transition. (This will be used to test whether an update triggers a reboot.)
* Implement `component_get_active_slot()` for host boot flash.
* Simulated sled only: we expose a watch channel with the current power state; if the state is A0, it also includes which host boot flash slot was active at the time we transitioned to A0. (We'll hook this up to a fake sled-agent so it can report a different boot disk after an sp-sim "reboot".)
1 parent cb110e2 commit 064995c

File tree

8 files changed

+203
-47
lines changed

8 files changed

+203
-47
lines changed

common/src/disk.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,34 @@ pub enum M2Slot {
626626
B,
627627
}
628628

629+
impl M2Slot {
630+
/// Flip from `A` to `B` or vice versa.
631+
pub fn toggled(self) -> Self {
632+
match self {
633+
Self::A => Self::B,
634+
Self::B => Self::A,
635+
}
636+
}
637+
638+
/// Convert this slot to an MGS "firmware slot" index.
639+
pub fn to_mgs_firmware_slot(self) -> u16 {
640+
match self {
641+
Self::A => 0,
642+
Self::B => 1,
643+
}
644+
}
645+
646+
/// Convert a putative MGS "firmware slot" index to an `M2Slot`, returning
647+
/// `None` if `slot` is invalid.
648+
pub fn from_mgs_firmware_slot(slot: u16) -> Option<Self> {
649+
match slot {
650+
0 => Some(Self::A),
651+
1 => Some(Self::B),
652+
_ => None,
653+
}
654+
}
655+
}
656+
629657
impl fmt::Display for M2Slot {
630658
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
631659
match self {

dev-tools/omdb/tests/successes.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ SPs FOUND THROUGH IGNITION
110110
SERVICE PROCESSOR STATES
111111

112112
TYPE SLOT MODEL SERIAL REV HUBRIS PWR ROT_ACTIVE
113-
Sled 0 i86pc SimGimlet00 0 0000000000000000 A2 slot A
114-
Sled 1 i86pc SimGimlet01 0 0000000000000000 A2 slot A
113+
Sled 0 i86pc SimGimlet00 0 0000000000000000 A0 slot A
114+
Sled 1 i86pc SimGimlet01 0 0000000000000000 A0 slot A
115115
Switch 0 FAKE_SIM_SIDECAR SimSidecar0 0 0000000000000000 A2 slot A
116116
Switch 1 FAKE_SIM_SIDECAR SimSidecar1 0 0000000000000000 A2 slot A
117117

nexus/mgs-updates/tests/host_phase1_hash.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,21 @@ async fn test_host_phase1_hashing() {
128128
.await
129129
.expect("starting hashing while hashing should be okay");
130130

131-
// Calculate the hash we expect to see.
131+
// Calculate the hashes we expect to see.
132132
let expected_sha256_0 = Sha256::digest(
133-
sp_sim.last_host_phase1_update_data(0).await.as_deref().unwrap_or(&[]),
133+
sp_sim
134+
.host_phase1_data(0)
135+
.await
136+
.as_deref()
137+
.expect("sled should have data in slot 0"),
138+
)
139+
.into();
140+
let expected_sha256_1 = Sha256::digest(
141+
sp_sim
142+
.host_phase1_data(1)
143+
.await
144+
.as_deref()
145+
.expect("sled should have data in slot 1"),
134146
)
135147
.into();
136148

@@ -155,7 +167,7 @@ async fn test_host_phase1_hashing() {
155167
phase1_checker
156168
.assert_status(&[
157169
(0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)),
158-
(1, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)),
170+
(1, ComponentFirmwareHashStatus::Hashed(expected_sha256_1)),
159171
])
160172
.await;
161173

@@ -222,7 +234,7 @@ async fn test_host_phase1_hashing() {
222234
}
223235

224236
// Confirm the simulator wrote the expected data in slot 1.
225-
let slot_1_data = sp_sim.last_host_phase1_update_data(1).await.unwrap();
237+
let slot_1_data = sp_sim.host_phase1_data(1).await.unwrap();
226238
assert_eq!(*slot_1_data, *fake_phase1);
227239

228240
// Writing an update should have put slot 1 back into the "needs hashing"

nexus/mgs-updates/tests/host_phase1_updater.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ async fn test_host_phase1_updater_updates_sled() {
6464

6565
// Ensure the SP received the complete update.
6666
let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
67-
.last_host_phase1_update_data(target_host_slot)
67+
.host_phase1_data(target_host_slot)
6868
.await
6969
.expect("simulated host phase1 did not receive an update");
7070

@@ -150,7 +150,7 @@ async fn test_host_phase1_updater_remembers_successful_mgs_instance() {
150150
host_phase1_updater.update(&mut mgs_clients).await.expect("update failed");
151151

152152
let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
153-
.last_host_phase1_update_data(target_host_slot)
153+
.host_phase1_data(target_host_slot)
154154
.await
155155
.expect("simulated host phase1 did not receive an update");
156156

@@ -361,7 +361,7 @@ async fn test_host_phase1_updater_switches_mgs_instances_on_failure() {
361361
);
362362

363363
let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
364-
.last_host_phase1_update_data(target_host_slot)
364+
.host_phase1_data(target_host_slot)
365365
.await
366366
.expect("simulated host phase1 did not receive an update");
367367

@@ -460,7 +460,7 @@ async fn test_host_phase1_updater_delivers_progress() {
460460
do_update_task.await.expect("update task panicked").expect("update failed");
461461

462462
let last_update_image = target_sp
463-
.last_host_phase1_update_data(target_host_slot)
463+
.host_phase1_data(target_host_slot)
464464
.await
465465
.expect("simulated host phase1 did not receive an update");
466466

sp-sim/src/gimlet.rs

Lines changed: 88 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,15 @@ use gateway_messages::{ComponentDetails, Message, MgsError, StartupOptions};
4848
use gateway_messages::{DiscoverResponse, IgnitionState, PowerState};
4949
use gateway_messages::{MessageKind, version};
5050
use gateway_types::component::SpState;
51+
use omicron_common::disk::M2Slot;
5152
use slog::{Logger, debug, error, info, warn};
5253
use std::cell::Cell;
5354
use std::collections::HashMap;
5455
use std::iter;
5556
use std::net::{SocketAddr, SocketAddrV6};
5657
use std::pin::Pin;
58+
use std::sync::atomic::AtomicUsize;
59+
use std::sync::atomic::Ordering;
5760
use std::sync::{Arc, Mutex};
5861
use tokio::io::{AsyncReadExt, AsyncWriteExt};
5962
use tokio::net::{TcpListener, TcpStream, UdpSocket};
@@ -86,6 +89,24 @@ pub enum SimSpHandledRequest {
8689
NotImplemented,
8790
}
8891

92+
/// Current power state and, if in A0, which M2 slot was active at the time
93+
/// we transitioned to A0. (This represents what disk the OS would attempt to
94+
/// boot from, if we were a real SP connected to a real sled.)
95+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
96+
pub enum GimletPowerState {
97+
A2,
98+
A0(M2Slot),
99+
}
100+
101+
impl From<GimletPowerState> for PowerState {
102+
fn from(value: GimletPowerState) -> Self {
103+
match value {
104+
GimletPowerState::A2 => Self::A2,
105+
GimletPowerState::A0(_) => Self::A0,
106+
}
107+
}
108+
}
109+
89110
pub struct Gimlet {
90111
local_addrs: Option<[SocketAddrV6; 2]>,
91112
ereport_addrs: Option<[SocketAddrV6; 2]>,
@@ -94,7 +115,9 @@ pub struct Gimlet {
94115
commands: mpsc::UnboundedSender<Command>,
95116
inner_tasks: Vec<JoinHandle<()>>,
96117
responses_sent_count: Option<watch::Receiver<usize>>,
118+
power_state_changes: Arc<AtomicUsize>,
97119
last_request_handled: Arc<Mutex<Option<SimSpHandledRequest>>>,
120+
power_state_rx: Option<watch::Receiver<GimletPowerState>>,
98121
}
99122

100123
impl Drop for Gimlet {
@@ -149,13 +172,10 @@ impl SimulatedSp for Gimlet {
149172
handler.update_state.last_rot_update_data()
150173
}
151174

152-
async fn last_host_phase1_update_data(
153-
&self,
154-
slot: u16,
155-
) -> Option<Box<[u8]>> {
175+
async fn host_phase1_data(&self, slot: u16) -> Option<Vec<u8>> {
156176
let handler = self.handler.as_ref()?;
157177
let handler = handler.lock().await;
158-
handler.update_state.last_host_phase1_update_data(slot)
178+
handler.update_state.host_phase1_data(slot)
159179
}
160180

161181
async fn current_update_status(&self) -> gateway_messages::UpdateStatus {
@@ -166,6 +186,10 @@ impl SimulatedSp for Gimlet {
166186
handler.lock().await.update_state.status()
167187
}
168188

189+
fn power_state_changes(&self) -> usize {
190+
self.power_state_changes.load(Ordering::Relaxed)
191+
}
192+
169193
fn responses_sent_count(&self) -> Option<watch::Receiver<usize>> {
170194
self.responses_sent_count.clone()
171195
}
@@ -235,6 +259,8 @@ impl Gimlet {
235259
inner_tasks,
236260
responses_sent_count: None,
237261
last_request_handled,
262+
power_state_rx: None,
263+
power_state_changes: Arc::new(AtomicUsize::new(0)),
238264
});
239265
};
240266

@@ -374,6 +400,9 @@ impl Gimlet {
374400
}
375401
}
376402
let local_addrs = [servers[0].local_addr(), servers[1].local_addr()];
403+
let (power_state, power_state_rx) =
404+
watch::channel(GimletPowerState::A0(M2Slot::A));
405+
let power_state_changes = Arc::new(AtomicUsize::new(0));
377406
let (inner, handler, responses_sent_count) = UdpTask::new(
378407
servers,
379408
ereport_servers,
@@ -382,11 +411,13 @@ impl Gimlet {
382411
attached_mgs,
383412
gimlet.common.serial_number.clone(),
384413
incoming_console_tx,
414+
power_state,
385415
commands_rx,
386416
Arc::clone(&last_request_handled),
387417
log,
388418
gimlet.common.old_rot_state,
389419
update_state,
420+
Arc::clone(&power_state_changes),
390421
);
391422
inner_tasks
392423
.push(task::spawn(async move { inner.run().await.unwrap() }));
@@ -400,9 +431,15 @@ impl Gimlet {
400431
inner_tasks,
401432
responses_sent_count: Some(responses_sent_count),
402433
last_request_handled,
434+
power_state_rx: Some(power_state_rx),
435+
power_state_changes,
403436
})
404437
}
405438

439+
/// Hand out a new receiver for this gimlet's power state watch channel.
///
/// Returns `None` if this `Gimlet` was constructed without a power state
/// channel.
pub fn power_state_rx(&self) -> Option<watch::Receiver<GimletPowerState>> {
    Option::clone(&self.power_state_rx)
}
442+
406443
pub fn serial_console_addr(&self, component: &str) -> Option<SocketAddrV6> {
407444
self.serial_console_addrs.get(component).copied()
408445
}
@@ -625,21 +662,25 @@ impl UdpTask {
625662
attached_mgs: AttachedMgsSerialConsole,
626663
serial_number: String,
627664
incoming_serial_console: HashMap<SpComponent, UnboundedSender<Vec<u8>>>,
665+
power_state: watch::Sender<GimletPowerState>,
628666
commands: mpsc::UnboundedReceiver<Command>,
629667
last_request_handled: Arc<Mutex<Option<SimSpHandledRequest>>>,
630668
log: Logger,
631669
old_rot_state: bool,
632670
update_state: SimSpUpdate,
671+
power_state_changes: Arc<AtomicUsize>,
633672
) -> (Self, Arc<TokioMutex<Handler>>, watch::Receiver<usize>) {
634673
let [udp0, udp1] = servers;
635674
let handler = Arc::new(TokioMutex::new(Handler::new(
636675
serial_number,
637676
components,
638677
attached_mgs,
639678
incoming_serial_console,
679+
power_state,
640680
log.clone(),
641681
old_rot_state,
642682
update_state,
683+
power_state_changes,
643684
)));
644685
let responses_sent_count = watch::Sender::new(0);
645686
let responses_sent_count_rx = responses_sent_count.subscribe();
@@ -782,7 +823,8 @@ struct Handler {
782823

783824
attached_mgs: AttachedMgsSerialConsole,
784825
incoming_serial_console: HashMap<SpComponent, UnboundedSender<Vec<u8>>>,
785-
power_state: PowerState,
826+
power_state: watch::Sender<GimletPowerState>,
827+
power_state_changes: Arc<AtomicUsize>,
786828
startup_options: StartupOptions,
787829
update_state: SimSpUpdate,
788830
reset_pending: Option<SpComponent>,
@@ -801,14 +843,17 @@ struct Handler {
801843
}
802844

803845
impl Handler {
846+
#[allow(clippy::too_many_arguments)]
804847
fn new(
805848
serial_number: String,
806849
components: Vec<SpComponentConfig>,
807850
attached_mgs: AttachedMgsSerialConsole,
808851
incoming_serial_console: HashMap<SpComponent, UnboundedSender<Vec<u8>>>,
852+
power_state: watch::Sender<GimletPowerState>,
809853
log: Logger,
810854
old_rot_state: bool,
811855
update_state: SimSpUpdate,
856+
power_state_changes: Arc<AtomicUsize>,
812857
) -> Self {
813858
let mut leaked_component_device_strings =
814859
Vec::with_capacity(components.len());
@@ -835,14 +880,15 @@ impl Handler {
835880
serial_number,
836881
attached_mgs,
837882
incoming_serial_console,
838-
power_state: PowerState::A2,
839883
startup_options: StartupOptions::empty(),
840884
update_state,
841885
reset_pending: None,
886+
power_state,
842887
last_request_handled: None,
843888
should_fail_to_respond_signal: None,
844889
old_rot_state,
845890
sp_dumps,
891+
power_state_changes,
846892
}
847893
}
848894

@@ -859,7 +905,7 @@ impl Handler {
859905
model,
860906
revision: 0,
861907
base_mac_address: [0; 6],
862-
power_state: self.power_state,
908+
power_state: (*self.power_state.borrow()).into(),
863909
rot: Ok(rot_state_v2(self.update_state.rot_state())),
864910
}
865911
}
@@ -1195,19 +1241,21 @@ impl SpHandler for Handler {
11951241
}
11961242

11971243
fn power_state(&mut self) -> Result<PowerState, SpError> {
1244+
let power_state = *self.power_state.borrow();
11981245
debug!(
11991246
&self.log, "received power state";
1200-
"power_state" => ?self.power_state,
1247+
"power_state" => ?power_state,
12011248
);
1202-
Ok(self.power_state)
1249+
Ok(power_state.into())
12031250
}
12041251

12051252
fn set_power_state(
12061253
&mut self,
12071254
sender: Sender<Self::VLanId>,
12081255
power_state: PowerState,
12091256
) -> Result<PowerStateTransition, SpError> {
1210-
let transition = if power_state != self.power_state {
1257+
let prev_power_state = *self.power_state.borrow();
1258+
let transition = if power_state != prev_power_state.into() {
12111259
PowerStateTransition::Changed
12121260
} else {
12131261
PowerStateTransition::Unchanged
@@ -1216,11 +1264,38 @@ impl SpHandler for Handler {
12161264
debug!(
12171265
&self.log, "received set power state";
12181266
"sender" => ?sender,
1219-
"prev_power_state" => ?self.power_state,
1267+
"prev_power_state" => ?power_state,
12201268
"power_state" => ?power_state,
12211269
"transition" => ?transition,
12221270
);
1223-
self.power_state = power_state;
1271+
1272+
let new_power_state = match power_state {
1273+
PowerState::A0 => {
1274+
let slot = self
1275+
.update_state
1276+
.component_get_active_slot(SpComponent::HOST_CPU_BOOT_FLASH)
1277+
.expect("can always get active slot for valid component");
1278+
let slot = M2Slot::from_mgs_firmware_slot(slot)
1279+
.expect("sp-sim ensures host slot is always valid");
1280+
GimletPowerState::A0(slot)
1281+
}
1282+
// `A1` is a transitory state that we can't even observe on real
1283+
// devices as of https://github.com/oxidecomputer/hubris/pull/2107.
1284+
// Our tests really care about "host powered on" (A0) or "host
1285+
// powered off" (A2), so just squish the transitory state down to
1286+
// "host powered off".
1287+
PowerState::A1 | PowerState::A2 => GimletPowerState::A2,
1288+
};
1289+
self.power_state.send_modify(|s| {
1290+
*s = new_power_state;
1291+
});
1292+
match transition {
1293+
PowerStateTransition::Changed => {
1294+
self.power_state_changes.fetch_add(1, Ordering::Relaxed);
1295+
}
1296+
PowerStateTransition::Unchanged => (),
1297+
}
1298+
12241299
Ok(transition)
12251300
}
12261301

0 commit comments

Comments
 (0)