Skip to content

Commit 8b13de1

Browse files
committed
Add task-dump endpoints to MGS
This exposes the `faux-mgs dump` command, which shows the number of task dumps present and enables downloading a dehydrated dump. Closes #7271
1 parent 7297d76 commit 8b13de1

File tree

14 files changed

+466
-9
lines changed

14 files changed

+466
-9
lines changed

Cargo.lock

Lines changed: 20 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,8 @@ gateway-client = { path = "clients/gateway-client" }
390390
# is "fine", because SP/MGS communication maintains forwards and backwards
391391
# compatibility, but will mean that faux-mgs might be missing new
392392
# functionality.)
393-
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", default-features = false, features = ["std"] }
394-
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" }
393+
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", default-features = false, features = ["std"] }
394+
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078" }
395395
gateway-test-utils = { path = "gateway-test-utils" }
396396
gateway-types = { path = "gateway-types" }
397397
gethostname = "0.5.0"

dev-tools/omdb/src/bin/omdb/mgs/sensors.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ pub(crate) struct Sensor {
119119
impl Sensor {
120120
fn units(&self) -> &str {
121121
match self.kind {
122-
MeasurementKind::Temperature => "°C",
122+
MeasurementKind::Temperature | MeasurementKind::CpuTctl => "°C",
123123
MeasurementKind::Current | MeasurementKind::InputCurrent => "A",
124124
MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V",
125125
MeasurementKind::Speed => "RPM",
@@ -150,6 +150,7 @@ impl Sensor {
150150
fn to_kind_string(&self) -> &str {
151151
match self.kind {
152152
MeasurementKind::Temperature => "temp",
153+
MeasurementKind::CpuTctl => "tctl",
153154
MeasurementKind::Power => "power",
154155
MeasurementKind::Current => "current",
155156
MeasurementKind::Voltage => "voltage",

gateway-api/src/lib.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use gateway_types::{
2020
ignition::{IgnitionCommand, SpIgnitionInfo},
2121
rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState},
2222
sensor::SpSensorReading,
23+
task_dump::TaskDump,
2324
update::{
2425
HostPhase2Progress, HostPhase2RecoveryImageId, InstallinatorImageId,
2526
SpUpdateStatus,
@@ -306,6 +307,26 @@ pub trait GatewayApi {
306307
params: TypedBody<GetRotBootInfoParams>,
307308
) -> Result<HttpResponseOk<RotState>, HttpError>;
308309

310+
/// Get the number of task dumps present on an SP
311+
#[endpoint {
312+
method = GET,
313+
path = "/sp/{type}/{slot}/task-dump",
314+
}]
315+
async fn sp_host_task_dump_count(
316+
rqctx: RequestContext<Self::Context>,
317+
path: Path<PathSp>,
318+
) -> Result<HttpResponseOk<u32>, HttpError>;
319+
320+
/// Read a single task dump from an SP
321+
#[endpoint {
322+
method = GET,
323+
path = "/sp/{type}/{slot}/task-dump/{task_dump_index}",
324+
}]
325+
async fn sp_host_task_dump_get(
326+
rqctx: RequestContext<Self::Context>,
327+
path: Path<PathSpTaskDumpIndex>,
328+
) -> Result<HttpResponseOk<TaskDump>, HttpError>;
329+
309330
/// List SPs via Ignition
310331
///
311332
/// Retrieve information for all SPs via the Ignition controller. This is
@@ -498,6 +519,16 @@ pub struct PathSpComponent {
498519
pub component: String,
499520
}
500521

522+
#[derive(Deserialize, JsonSchema)]
523+
pub struct PathSpTaskDumpIndex {
524+
/// ID for the SP that the gateway service translates into the appropriate
525+
/// port for communicating with the given SP.
526+
#[serde(flatten)]
527+
pub sp: SpIdentifier,
528+
/// The index of the task dump to be read.
529+
pub task_dump_index: u32,
530+
}
531+
501532
#[derive(Deserialize, JsonSchema)]
502533
pub struct ComponentCabooseSlot {
503534
/// The firmware slot for which we want to request caboose information.

gateway-types/src/component_details.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ pub enum MeasurementKind {
374374
InputCurrent,
375375
InputVoltage,
376376
Speed,
377+
CpuTctl,
377378
}
378379

379380
impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
@@ -387,6 +388,7 @@ impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
387388
MeasurementKind::InputCurrent => Self::InputCurrent,
388389
MeasurementKind::InputVoltage => Self::InputVoltage,
389390
MeasurementKind::Speed => Self::Speed,
391+
MeasurementKind::CpuTctl => Self::CpuTctl,
390392
}
391393
}
392394
}

gateway-types/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ pub mod host;
1111
pub mod ignition;
1212
pub mod rot;
1313
pub mod sensor;
14+
pub mod task_dump;
1415
pub mod update;

gateway-types/src/task_dump.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use schemars::JsonSchema;
6+
use serde::{Deserialize, Serialize};
7+
use std::collections::BTreeMap;
8+
9+
#[derive(
10+
Debug,
11+
Clone,
12+
PartialEq,
13+
Eq,
14+
PartialOrd,
15+
Ord,
16+
Deserialize,
17+
Serialize,
18+
JsonSchema,
19+
)]
20+
pub struct TaskDump {
21+
/// Index of the crashed task.
22+
pub task_index: u16,
23+
/// Timestamp at which the task crash occurred.
24+
pub timestamp: u64,
25+
/// Hex-encoded Hubris archive ID.
26+
pub archive_id: String,
27+
/// `BORD` field from the caboose.
28+
pub bord: String,
29+
/// `GITC` field from the caboose.
30+
pub gitc: String,
31+
/// `VERS` field from the caboose, if present.
32+
pub vers: Option<String>,
33+
/// Base64-encoded raw memory read from the SP.
34+
pub base64_memory: BTreeMap<u32, String>,
35+
}

gateway/src/http_entrypoints.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use gateway_types::rot::RotCfpaSlot;
4343
use gateway_types::rot::RotCmpa;
4444
use gateway_types::rot::RotState;
4545
use gateway_types::sensor::SpSensorReading;
46+
use gateway_types::task_dump::TaskDump;
4647
use gateway_types::update::HostPhase2Progress;
4748
use gateway_types::update::HostPhase2RecoveryImageId;
4849
use gateway_types::update::InstallinatorImageId;
@@ -655,6 +656,66 @@ impl GatewayApi for GatewayImpl {
655656
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
656657
}
657658

659+
async fn sp_host_task_dump_count(
660+
rqctx: RequestContext<Self::Context>,
661+
path: Path<PathSp>,
662+
) -> Result<HttpResponseOk<u32>, HttpError> {
663+
let apictx = rqctx.context();
664+
let sp_id = path.into_inner().sp.into();
665+
666+
let handler = async {
667+
let sp = apictx.mgmt_switch.sp(sp_id)?;
668+
let ct = sp.task_dump_count().await.map_err(|err| {
669+
SpCommsError::SpCommunicationFailed { sp: sp_id, err }
670+
})?;
671+
672+
Ok(HttpResponseOk(ct))
673+
};
674+
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
675+
}
676+
677+
async fn sp_host_task_dump_get(
678+
rqctx: RequestContext<Self::Context>,
679+
path: Path<PathSpTaskDumpIndex>,
680+
) -> Result<HttpResponseOk<TaskDump>, HttpError> {
681+
let apictx = rqctx.context();
682+
let path = path.into_inner();
683+
let task_index = path.task_dump_index;
684+
let sp_id = path.sp.into();
685+
686+
let handler = async {
687+
let sp = apictx.mgmt_switch.sp(sp_id)?;
688+
let raw_dump =
689+
sp.task_dump_read(task_index).await.map_err(|err| {
690+
SpCommsError::SpCommunicationFailed { sp: sp_id, err }
691+
})?;
692+
693+
let archive_id = hex::encode(raw_dump.archive_id);
694+
let base64_memory = raw_dump
695+
.memory
696+
.into_iter()
697+
.map(|(key, mem)| {
698+
let base64_mem =
699+
base64::engine::general_purpose::STANDARD.encode(mem);
700+
(key, base64_mem)
701+
})
702+
.collect();
703+
704+
let dump = TaskDump {
705+
task_index: raw_dump.task_index,
706+
timestamp: raw_dump.timestamp,
707+
archive_id,
708+
bord: raw_dump.bord,
709+
gitc: raw_dump.gitc,
710+
vers: raw_dump.vers,
711+
base64_memory,
712+
};
713+
714+
Ok(HttpResponseOk(dump))
715+
};
716+
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
717+
}
718+
658719
async fn ignition_list(
659720
rqctx: RequestContext<Self::Context>,
660721
) -> Result<HttpResponseOk<Vec<SpIgnitionInfo>>, HttpError> {

gateway/src/metrics.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,7 @@ impl SpPoller {
804804
MeasurementKind::InputCurrent => "input_current",
805805
MeasurementKind::InputVoltage => "input_voltage",
806806
MeasurementKind::Speed => "fan_speed",
807+
MeasurementKind::CpuTctl => "cpu_tctl",
807808
};
808809
let error = match error {
809810
MeasurementError::InvalidSensor => "invalid_sensor",
@@ -858,6 +859,10 @@ impl SpPoller {
858859
&metric::AmdCpuTctl { sensor, datum },
859860
)
860861
}
862+
(Ok(datum), MeasurementKind::CpuTctl) => Sample::new(
863+
target,
864+
&metric::AmdCpuTctl { sensor, datum },
865+
),
861866
// Other measurements with the "temperature" measurement
862867
// kind are physical temperatures that actually exist in
863868
// reality (and are always in Celsius).
@@ -873,6 +878,12 @@ impl SpPoller {
873878
&metric::AmdCpuTctl { sensor, datum: 0.0 },
874879
)
875880
}
881+
(Err(_), MeasurementKind::CpuTctl) => {
882+
Sample::new_missing(
883+
target,
884+
&metric::AmdCpuTctl { sensor, datum: 0.0 },
885+
)
886+
}
876887
(Err(_), MeasurementKind::Temperature) => {
877888
Sample::new_missing(
878889
target,
@@ -1205,5 +1216,8 @@ fn comms_error_str(error: CommunicationError) -> &'static str {
12051216
CommunicationError::BadTrailingDataSize { .. } => {
12061217
"bad_trailing_data_size"
12071218
}
1219+
CommunicationError::BadDecompressionSize { .. } => {
1220+
"bad_decompression_size"
1221+
}
12081222
}
12091223
}

nexus/tests/integration_tests/metrics.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,7 @@ async fn test_mgs_metrics(
837837
temp += 1;
838838
}
839839
}
840+
Kind::CpuTctl => cpu_tctl += 1,
840841
Kind::Current => current += 1,
841842
Kind::Voltage => voltage += 1,
842843
Kind::InputVoltage => input_voltage += 1,

0 commit comments

Comments
 (0)