Skip to content

Commit a64d69f

Browse files
committed
Add AMD GPU detection to ghostdog
Signed-off-by: Gaurav Sharma <[email protected]>
1 parent 27e1d76 commit a64d69f

File tree

4 files changed

+73
-2
lines changed

4 files changed

+73
-2
lines changed

sources/ghostdog/src/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,14 @@ pub(super) enum Error {
3232
CheckEfaFailure { source: pciclient::PciClientError },
3333
#[snafu(display("Failed to check if Neuron device is attached: {}", source))]
3434
CheckNeuronFailure { source: pciclient::PciClientError },
35+
#[snafu(display("Failed to check if AMD GPU device is attached: {}", source))]
36+
CheckAmdGpuFailure { source: pciclient::PciClientError },
3537
#[snafu(display("Did not detect EFA"))]
3638
NoEfaPresent,
3739
#[snafu(display("Did not detect Neuron"))]
3840
NoNeuronPresent,
41+
#[snafu(display("Did not detect AMD GPU"))]
42+
NoAmdGpuPresent,
3943
#[snafu(display("'{}' has no parent directory", path.display()))]
4044
NoParentDirectory { path: std::path::PathBuf },
4145
#[snafu(display("Failed to open '{}': {}", path.display(), source))]

sources/ghostdog/src/main.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ enum SubCommand {
5858
EbsDeviceName(EbsDeviceNameArgs),
5959
EfaPresent(EfaPresentArgs),
6060
NeuronPresent(NeuronPresentArgs),
61+
AmdGpuPresent(AmdGpuPresentArgs),
6162
MatchDriver(MatchDriverArgs),
6263
MatchNvidiaDriver(MatchNvidiaDriverArgs),
6364
WriteInfinibandGuid(WriteInfinibandGuidArgs),
@@ -73,6 +74,11 @@ struct EfaPresentArgs {}
7374
/// Detect if Neuron devices are attached.
7475
struct NeuronPresentArgs {}
7576

77+
// Arguments for the `amd-gpu-present` subcommand; the struct has no fields, so the
// subcommand takes no options.
// NOTE(review): argh presumably surfaces the `///` line below as the subcommand's help
// text, so treat it as user-facing output rather than an internal comment — confirm.
#[derive(FromArgs, PartialEq, Debug)]
78+
#[argh(subcommand, name = "amd-gpu-present")]
79+
/// Detect if AMD GPU devices are attached.
80+
struct AmdGpuPresentArgs {}
81+
7682
#[derive(FromArgs, PartialEq, Debug)]
7783
#[argh(subcommand, name = "scan")]
7884
/// Scan a device to see if it is an ephemeral disk.
@@ -163,6 +169,9 @@ fn main() -> Result<()> {
163169
SubCommand::NeuronPresent(_) => {
164170
is_neuron_attached()?;
165171
}
172+
SubCommand::AmdGpuPresent(_) => {
173+
is_amd_gpu_attached()?;
174+
}
166175
SubCommand::MatchNvidiaDriver(driver) => {
167176
let driver_name = driver.driver_name;
168177
nvidia_driver_supported(&driver_name)?;
@@ -203,6 +212,14 @@ fn is_neuron_attached() -> Result<()> {
203212
}
204213
}
205214

215+
// Returns Ok(()) if `pciclient::is_amd_gpu_attached` reports an attached AMD GPU.
// Returns `Error::NoAmdGpuPresent` if the check ran but found none, and
// `Error::CheckAmdGpuFailure` (wrapping the pciclient error) if the check itself failed.
fn is_amd_gpu_attached() -> Result<()> {
216+
// `?` surfaces a failed PCI query before the presence result is inspected.
if pciclient::is_amd_gpu_attached().context(error::CheckAmdGpuFailureSnafu)? {
217+
Ok(())
218+
} else {
219+
Err(error::Error::NoAmdGpuPresent)
220+
}
221+
}
222+
206223
// Returns true if this is an inf1 instance
207224
fn is_inf1_instance() -> Result<()> {
208225
if pciclient::is_inf1_instance().context(error::CheckInf1FailureSnafu)? {

sources/pciclient/src/lib.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ mod private;
99

1010
use private::{
1111
call_list_devices, check_efa_attachment, check_inf1_attachment, check_inf2_attachment,
12-
check_neuron_attachment, PciClient,
12+
check_neuron_attachment, check_amd_gpu_attachment, PciClient,
1313
};
1414

1515
use bon::Builder;
@@ -123,6 +123,11 @@ pub fn is_neuron_attached() -> Result<bool> {
123123
check_neuron_attachment(PciClient {})
124124
}
125125

126+
/// Call `lspci` and check if a known AMD GPU device is attached.
///
/// Delegates to `check_amd_gpu_attachment` with the real `PciClient`. Only device IDs
/// listed in the private `AMD_GPU_DEVICES` set count as a match, so an AMD device
/// outside that set yields `Ok(false)`.
127+
pub fn is_amd_gpu_attached() -> Result<bool> {
128+
check_amd_gpu_attachment(PciClient {})
129+
}
130+
126131
/// Call `lspci` and check if there are inf1 devices attached
127132
pub fn is_inf1_instance() -> Result<bool> {
128133
check_inf1_attachment(PciClient {})

sources/pciclient/src/private.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use crate::{
1414

1515
const AMAZON_VENDOR_CODE: &str = "1d0f";
1616
const EFA_KEYWORD: &str = "efa";
17+
// PCI vendor code for AMD; set as the vendor on the device-list request built in
// `check_amd_gpu_attachment`.
const AMD_VENDOR_CODE: &str = "1002";
1718

1819
// Neuron devices specific to inf1 instance types
1920
lazy_static! {
@@ -34,6 +35,12 @@ lazy_static! {
3435
};
3536
}
3637

38+
// PCI device IDs treated as AMD GPUs by `check_amd_gpu_attachment`.
// Entries are compared as exact strings against the parsed device field, so new IDs
// must be written in the same form as the existing entry.
lazy_static! {
39+
static ref AMD_GPU_DEVICES: HashSet<&'static str> = hashset! {
40+
"75a3" // AMD Instinct MI355X
41+
};
42+
}
43+
3744
const LSPCI_PATH: &str = "/usr/bin/lspci";
3845

3946
pub(crate) trait CommandExecutor {
@@ -183,6 +190,19 @@ pub(crate) fn check_neuron_attachment<T: CommandExecutor>(command_executor: T) -
183190
}))
184191
}
185192

193+
/// Call `lspci` and check if a known AMD GPU device is attached.
///
/// Builds a device-list request for the AMD vendor code and returns `Ok(true)` when any
/// listed device has an ID contained in `AMD_GPU_DEVICES`, `Ok(false)` otherwise.
/// Errors from `call_list_devices` are propagated unchanged.
194+
/// Internal usage: taking `command_executor` as a parameter lets unit tests supply a mock.
195+
/// For external usage, check [`crate::is_amd_gpu_attached`].
196+
pub(crate) fn check_amd_gpu_attachment<T: CommandExecutor>(command_executor: T) -> Result<bool> {
197+
// Build the list request with AMD's vendor code set.
let list_devices_param = ListDevicesParam::builder()
198+
.vendor(AMD_VENDOR_CODE.to_string())
199+
.build();
200+
let list_device_results = call_list_devices(command_executor, list_devices_param)?;
201+
// A device counts only if its device ID is one of the listed AMD GPU IDs.
Ok(list_device_results
202+
.iter()
203+
.any(|device_info| AMD_GPU_DEVICES.contains(&device_info.device.as_str())))
204+
}
205+
186206
/// Call `lspci` and check if there is any inf1-specific Neuron device attached.
187207
/// Internal usage, adding command_executor as parameter allows us to better unit test.
188208
/// For external usage, check [`crate::is_inf1_instance`].
@@ -229,7 +249,7 @@ mod test {
229249

230250
use super::{
231251
call_list_devices, check_efa_attachment, check_inf1_attachment, check_neuron_attachment,
232-
parse_list_devices_output, CommandExecutor,
252+
check_amd_gpu_attachment, parse_list_devices_output, CommandExecutor,
233253
};
234254

235255
struct MockPciClient {
@@ -436,6 +456,31 @@ mod test {
436456
assert!(check_inf1_attachment_result.unwrap());
437457
}
438458

459+
// A mocked device listing that contains the MI355X entry must be reported as attached.
#[test]
460+
fn test_is_amd_gpu_attached() {
461+
let mock_pci_client = MockPciClient {
462+
// AMD Instinct MI355X device has vendor 1002 for AMD, device code 75a3.
// The second entry belongs to a different vendor (10de) and must not change the result.
463+
output: vec![
464+
r#"00:1e.0 "0300" "1002" "75a3" -p00 "1002" "0123""#.to_string(),
465+
r#"00:1f.0 "0302" "10de" "1eb8" -ra1 -p00 "10de" "12a2""#.to_string(),
466+
],
467+
};
468+
let check_amd_gpu_attachment_result = check_amd_gpu_attachment(mock_pci_client);
469+
assert!(check_amd_gpu_attachment_result.is_ok());
470+
assert!(check_amd_gpu_attachment_result.unwrap());
471+
}
472+
473+
// A listing without any recognized AMD GPU must yield Ok(false) rather than an error.
// NOTE(review): this only exercises a non-AMD vendor (1d0f); consider also covering an
// AMD-vendor (1002) device whose ID is not in `AMD_GPU_DEVICES`.
#[test]
474+
fn test_is_amd_gpu_attached_negative_case() {
475+
let mock_pci_client = MockPciClient {
476+
// Below is actual `lspci` output for an ENA device (not an AMD GPU).
477+
output: vec![r#"00:06.0 "0200" "1d0f" "ec20" -p00 "1d0f" "ec20""#.to_string()],
478+
};
479+
let check_amd_gpu_attachment_result = check_amd_gpu_attachment(mock_pci_client);
480+
assert!(check_amd_gpu_attachment_result.is_ok());
481+
assert!(!check_amd_gpu_attachment_result.unwrap());
482+
}
483+
439484
#[test]
440485
fn test_is_inf1_attached_negative_case() {
441486
let mock_pci_client = MockPciClient {

0 commit comments

Comments
 (0)