Skip to content

Commit bc30ba8

Browse files
authored
Add disk-related commands to support bundles (#7876)
Support bundles will eventually replace the `check-health.sh` script we use today to validate rack health. We have several network-related commands, but none that check sled storage. Execute `nvmeadm(8)`, `zfs(8)`, and `zpool(8)` as part of a bundle to capture a basic disk health check.
1 parent 0dad016 commit bc30ba8

File tree

8 files changed

+213
-0
lines changed

8 files changed

+213
-0
lines changed

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,12 @@ impl BundleCollection<'_> {
624624
sled_client.support_ipadm_info(),
625625
)
626626
.boxed(),
627+
save_diag_cmd_output_or_error(
628+
&sled_path,
629+
"nvmeadm",
630+
sled_client.support_nvmeadm_info(),
631+
)
632+
.boxed(),
627633
save_diag_cmd_output_or_error(
628634
&sled_path,
629635
"pargs",
@@ -642,6 +648,18 @@ impl BundleCollection<'_> {
642648
sled_client.support_pstack_info(),
643649
)
644650
.boxed(),
651+
save_diag_cmd_output_or_error(
652+
&sled_path,
653+
"zfs",
654+
sled_client.support_zfs_info(),
655+
)
656+
.boxed(),
657+
save_diag_cmd_output_or_error(
658+
&sled_path,
659+
"zpool",
660+
sled_client.support_zpool_info(),
661+
)
662+
.boxed(),
645663
])
646664
// Currently we execute up to 10 commands concurrently which
647665
// might be doing their own concurrent work, for example

openapi/sled-agent.json

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,29 @@
703703
}
704704
}
705705
},
706+
"/support/nvmeadm-info": {
707+
"get": {
708+
"operationId": "support_nvmeadm_info",
709+
"responses": {
710+
"200": {
711+
"description": "successful operation",
712+
"content": {
713+
"application/json": {
714+
"schema": {
715+
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
716+
}
717+
}
718+
}
719+
},
720+
"4XX": {
721+
"$ref": "#/components/responses/Error"
722+
},
723+
"5XX": {
724+
"$ref": "#/components/responses/Error"
725+
}
726+
}
727+
}
728+
},
706729
"/support/pargs-info": {
707730
"get": {
708731
"operationId": "support_pargs_info",
@@ -784,6 +807,29 @@
784807
}
785808
}
786809
},
810+
"/support/zfs-info": {
811+
"get": {
812+
"operationId": "support_zfs_info",
813+
"responses": {
814+
"200": {
815+
"description": "successful operation",
816+
"content": {
817+
"application/json": {
818+
"schema": {
819+
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
820+
}
821+
}
822+
}
823+
},
824+
"4XX": {
825+
"$ref": "#/components/responses/Error"
826+
},
827+
"5XX": {
828+
"$ref": "#/components/responses/Error"
829+
}
830+
}
831+
}
832+
},
787833
"/support/zoneadm-info": {
788834
"get": {
789835
"operationId": "support_zoneadm_info",
@@ -807,6 +853,29 @@
807853
}
808854
}
809855
},
856+
"/support/zpool-info": {
857+
"get": {
858+
"operationId": "support_zpool_info",
859+
"responses": {
860+
"200": {
861+
"description": "successful operation",
862+
"content": {
863+
"application/json": {
864+
"schema": {
865+
"$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
866+
}
867+
}
868+
}
869+
},
870+
"4XX": {
871+
"$ref": "#/components/responses/Error"
872+
},
873+
"5XX": {
874+
"$ref": "#/components/responses/Error"
875+
}
876+
}
877+
}
878+
},
810879
"/support-bundles/{zpool_id}/{dataset_id}": {
811880
"get": {
812881
"summary": "List all support bundles within a particular dataset",

sled-agent/api/src/lib.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,14 @@ pub trait SledAgentApi {
644644
request_context: RequestContext<Self::Context>,
645645
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;
646646

647+
// GET /support/nvmeadm-info: returns a single `SledDiagnosticsQueryOutput`
// on success, or an `HttpError`.
// NOTE(review): kept as a plain `//` comment on purpose; a `///` doc comment
// on an endpoint presumably flows into the generated OpenAPI document —
// confirm before converting.
#[endpoint {
648+
method = GET,
649+
path = "/support/nvmeadm-info",
650+
}]
651+
async fn support_nvmeadm_info(
652+
request_context: RequestContext<Self::Context>,
653+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
654+
647655
#[endpoint {
648656
method = GET,
649657
path = "/support/pargs-info",
@@ -667,6 +675,22 @@ pub trait SledAgentApi {
667675
async fn support_pfiles_info(
668676
request_context: RequestContext<Self::Context>,
669677
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;
678+
679+
// GET /support/zfs-info: returns a single `SledDiagnosticsQueryOutput`
// on success, or an `HttpError`.
// NOTE(review): kept as a plain `//` comment on purpose; a `///` doc comment
// on an endpoint presumably flows into the generated OpenAPI document —
// confirm before converting.
#[endpoint {
680+
method = GET,
681+
path = "/support/zfs-info",
682+
}]
683+
async fn support_zfs_info(
684+
request_context: RequestContext<Self::Context>,
685+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
686+
687+
// GET /support/zpool-info: returns a single `SledDiagnosticsQueryOutput`
// on success, or an `HttpError`.
// NOTE(review): kept as a plain `//` comment on purpose; a `///` doc comment
// on an endpoint presumably flows into the generated OpenAPI document —
// confirm before converting.
#[endpoint {
688+
method = GET,
689+
path = "/support/zpool-info",
690+
}]
691+
async fn support_zpool_info(
692+
request_context: RequestContext<Self::Context>,
693+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
670694
}
671695

672696
#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]

sled-agent/src/http_entrypoints.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -991,6 +991,14 @@ impl SledAgentApi for SledAgentImpl {
991991
))
992992
}
993993

994+
// Handler for the `support_nvmeadm_info` endpoint: fetches the request's
// context object, awaits its `support_nvmeadm_info()` query, and wraps
// `res.get_output()` in an `HttpResponseOk`. This handler never returns `Err`
// itself.
// NOTE(review): `get_output()` is defined outside this view; presumably it
// folds a failed command into the output value — confirm.
async fn support_nvmeadm_info(
995+
request_context: RequestContext<Self::Context>,
996+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
997+
let sa = request_context.context();
998+
let res = sa.support_nvmeadm_info().await;
999+
Ok(HttpResponseOk(res.get_output()))
1000+
}
1001+
9941002
async fn support_pargs_info(
9951003
request_context: RequestContext<Self::Context>,
9961004
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
@@ -1032,4 +1040,20 @@ impl SledAgentApi for SledAgentImpl {
10321040
.collect::<Vec<_>>(),
10331041
))
10341042
}
1043+
1044+
// Handler for the `support_zfs_info` endpoint: fetches the request's context
// object, awaits its `support_zfs_info()` query, and wraps `res.get_output()`
// in an `HttpResponseOk`. This handler never returns `Err` itself.
// NOTE(review): `get_output()` is defined outside this view; presumably it
// folds a failed command into the output value — confirm.
async fn support_zfs_info(
1045+
request_context: RequestContext<Self::Context>,
1046+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
1047+
let sa = request_context.context();
1048+
let res = sa.support_zfs_info().await;
1049+
Ok(HttpResponseOk(res.get_output()))
1050+
}
1051+
1052+
// Handler for the `support_zpool_info` endpoint: fetches the request's
// context object, awaits its `support_zpool_info()` query, and wraps
// `res.get_output()` in an `HttpResponseOk`. This handler never returns `Err`
// itself.
// NOTE(review): `get_output()` is defined outside this view; presumably it
// folds a failed command into the output value — confirm.
async fn support_zpool_info(
1053+
request_context: RequestContext<Self::Context>,
1054+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
1055+
let sa = request_context.context();
1056+
let res = sa.support_zpool_info().await;
1057+
Ok(HttpResponseOk(res.get_output()))
1058+
}
10351059
}

sled-agent/src/sim/http_entrypoints.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,12 @@ impl SledAgentApi for SledAgentSimImpl {
719719
method_unimplemented()
720720
}
721721

722+
// The simulated sled agent does not run `nvmeadm`; the request context is
// ignored and the result of `method_unimplemented()` is returned as-is.
async fn support_nvmeadm_info(
723+
_request_context: RequestContext<Self::Context>,
724+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
725+
method_unimplemented()
726+
}
727+
722728
async fn support_pargs_info(
723729
_request_context: RequestContext<Self::Context>,
724730
) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
@@ -739,6 +745,18 @@ impl SledAgentApi for SledAgentSimImpl {
739745
{
740746
method_unimplemented()
741747
}
748+
749+
// The simulated sled agent does not run `zfs`; the request context is
// ignored and the result of `method_unimplemented()` is returned as-is.
async fn support_zfs_info(
750+
_request_context: RequestContext<Self::Context>,
751+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
752+
method_unimplemented()
753+
}
754+
755+
// The simulated sled agent does not run `zpool`; the request context is
// ignored and the result of `method_unimplemented()` is returned as-is.
async fn support_zpool_info(
756+
_request_context: RequestContext<Self::Context>,
757+
) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError> {
758+
method_unimplemented()
759+
}
742760
}
743761

744762
fn method_unimplemented<T>() -> Result<T, HttpError> {

sled-agent/src/sled_agent.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,12 @@ impl SledAgent {
14171417
sled_diagnostics::dladm_info().await
14181418
}
14191419

1420+
/// Collect `nvmeadm` diagnostics for a support bundle.
///
/// Thin wrapper that awaits `sled_diagnostics::nvmeadm_info()` and returns
/// its result unchanged; no state from `self` is read.
pub(crate) async fn support_nvmeadm_info(
1421+
&self,
1422+
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
1423+
sled_diagnostics::nvmeadm_info().await
1424+
}
1425+
14201426
pub(crate) async fn support_pargs_info(
14211427
&self,
14221428
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
@@ -1434,6 +1440,18 @@ impl SledAgent {
14341440
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
14351441
sled_diagnostics::pfiles_oxide_processes(&self.log).await
14361442
}
1443+
1444+
/// Collect `zfs` diagnostics for a support bundle.
///
/// Thin wrapper that awaits `sled_diagnostics::zfs_info()` and returns its
/// result unchanged; no state from `self` is read.
pub(crate) async fn support_zfs_info(
1445+
&self,
1446+
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
1447+
sled_diagnostics::zfs_info().await
1448+
}
1449+
1450+
/// Collect `zpool` diagnostics for a support bundle.
///
/// Thin wrapper that awaits `sled_diagnostics::zpool_info()` and returns its
/// result unchanged; no state from `self` is read.
pub(crate) async fn support_zpool_info(
1451+
&self,
1452+
) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
1453+
sled_diagnostics::zpool_info().await
1454+
}
14371455
}
14381456

14391457
#[derive(From, thiserror::Error, Debug)]

sled-diagnostics/src/lib.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ pub async fn dladm_info()
6161
.await
6262
}
6363

64+
/// Retrieve `nvmeadm` command output for the system.
///
/// Runs the command built by `nvmeadm_list()` through
/// `execute_command_with_timeout`, bounded by `DEFAULT_TIMEOUT`.
pub async fn nvmeadm_info()
65+
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
66+
execute_command_with_timeout(nvmeadm_list(), DEFAULT_TIMEOUT).await
67+
}
68+
6469
pub async fn pargs_oxide_processes(
6570
log: &Logger,
6671
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
@@ -123,3 +128,15 @@ pub async fn pfiles_oxide_processes(
123128
.collect::<Vec<Result<_, _>>>()
124129
.await
125130
}
131+
132+
/// Retrieve `zfs` command output for the system.
///
/// Runs the single command built by `zfs_list()` through
/// `execute_command_with_timeout`, bounded by `DEFAULT_TIMEOUT`.
133+
pub async fn zfs_info()
134+
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
135+
execute_command_with_timeout(zfs_list(), DEFAULT_TIMEOUT).await
136+
}
137+
138+
/// Retrieve `zpool` command output for the system.
///
/// Runs the single command built by `zpool_status()` through
/// `execute_command_with_timeout`, bounded by `DEFAULT_TIMEOUT`.
139+
pub async fn zpool_info()
140+
-> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
141+
execute_command_with_timeout(zpool_status(), DEFAULT_TIMEOUT).await
142+
}

sled-diagnostics/src/queries.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,14 @@ use crate::contract_stub::ContractError;
2222

2323
const DLADM: &str = "/usr/sbin/dladm";
2424
const IPADM: &str = "/usr/sbin/ipadm";
25+
const NVMEADM: &str = "/usr/sbin/nvmeadm";
2526
const PFEXEC: &str = "/usr/bin/pfexec";
2627
const PFILES: &str = "/usr/bin/pfiles";
2728
const PSTACK: &str = "/usr/bin/pstack";
2829
const PARGS: &str = "/usr/bin/pargs";
30+
const ZFS: &str = "/usr/sbin/zfs";
2931
const ZONEADM: &str = "/usr/sbin/zoneadm";
32+
const ZPOOL: &str = "/usr/sbin/zpool";
3033

3134
pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
3235

@@ -236,6 +239,12 @@ pub fn dladm_show_linkprop() -> Command {
236239
cmd
237240
}
238241

242+
/// Build (but do not run) the command `pfexec nvmeadm list`.
///
/// The program is `PFEXEC` with `NVMEADM` and `list` as arguments, and the
/// child's environment is cleared via `env_clear()`.
pub fn nvmeadm_list() -> Command {
243+
let mut cmd = std::process::Command::new(PFEXEC);
244+
cmd.env_clear().arg(NVMEADM).arg("list");
245+
cmd
246+
}
247+
239248
pub fn pargs_process(pid: i32) -> Command {
240249
let mut cmd = std::process::Command::new(PFEXEC);
241250
cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string());
@@ -254,6 +263,22 @@ pub fn pfiles_process(pid: i32) -> Command {
254263
cmd
255264
}
256265

266+
/// Build (but do not run) the command
/// `pfexec zfs list -o name,used,avail,quota,reservation,mountpoint,mounted`.
///
/// The program is `PFEXEC` with `ZFS` and the listing arguments appended, and
/// the child's environment is cleared via `env_clear()`.
pub fn zfs_list() -> Command {
267+
let mut cmd = std::process::Command::new(PFEXEC);
268+
cmd.env_clear()
269+
.arg(ZFS)
270+
.arg("list")
271+
.arg("-o")
272+
.arg("name,used,avail,quota,reservation,mountpoint,mounted");
273+
cmd
274+
}
275+
276+
/// Build (but do not run) the command `pfexec zpool status`.
///
/// The program is `PFEXEC` with `ZPOOL` and `status` as arguments, and the
/// child's environment is cleared via `env_clear()`.
pub fn zpool_status() -> Command {
277+
let mut cmd = std::process::Command::new(PFEXEC);
278+
cmd.env_clear().arg(ZPOOL).arg("status");
279+
cmd
280+
}
281+
257282
#[cfg(test)]
258283
mod test {
259284
use super::*;

0 commit comments

Comments
 (0)