Skip to content

Commit 5038ad8

Browse files
committed
ghostdog: Add AMD GPU driver validation
Add match-driver support for AMD GPUs to validate that the amdgpu kernel module and ROCm KFD driver are properly loaded before starting the device plugin service. Signed-off-by: Gaurav Sharma <[email protected]>
1 parent 27e1d76 commit 5038ad8

File tree

5 files changed

+112
-2
lines changed

5 files changed

+112
-2
lines changed

packages/amd-k8s-device-plugin/amd-k8s-device-plugin.service

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,15 @@
22
Description=Start AMD kubernetes device plugin
33
After=kubelet.service
44
Wants=kubelet.service
5+
ConditionPathExists=/usr/bin/ghostdog
56

67
[Service]
8+
# Verify AMD GPU is detected
9+
ExecStartPre=/usr/bin/ghostdog amd-gpu-present
10+
# Verify amdgpu driver is loaded
11+
ExecStartPre=/usr/bin/ghostdog match-driver amd rocm
12+
# Ensure that the kubelet device plugin socket exists before we start
13+
# A brief sleep is needed to avoid the `test` failing its first check
714
ExecStartPre=/usr/bin/sleep 0.1
815
ExecStartPre=/usr/bin/test -S /var/lib/kubelet/device-plugins/kubelet.sock
916
ExecStart=/usr/bin/amd-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5

sources/ghostdog/src/error.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,14 @@ pub(super) enum Error {
3232
CheckEfaFailure { source: pciclient::PciClientError },
3333
#[snafu(display("Failed to check if Neuron device is attached: {}", source))]
3434
CheckNeuronFailure { source: pciclient::PciClientError },
35+
#[snafu(display("Failed to check if AMD GPU device is attached: {}", source))]
36+
CheckAmdGpuFailure { source: pciclient::PciClientError },
3537
#[snafu(display("Did not detect EFA"))]
3638
NoEfaPresent,
3739
#[snafu(display("Did not detect Neuron"))]
3840
NoNeuronPresent,
41+
#[snafu(display("Did not detect AMD GPU"))]
42+
NoAmdGpuPresent,
3943
#[snafu(display("'{}' has no parent directory", path.display()))]
4044
NoParentDirectory { path: std::path::PathBuf },
4145
#[snafu(display("Failed to open '{}': {}", path.display(), source))]
@@ -78,6 +82,8 @@ pub(super) enum Error {
7882
NoInf1Present,
7983
#[snafu(display("Did not detect inf2+ hardware"))]
8084
NoInf2Present,
85+
#[snafu(display("AMD GPU driver not loaded: {}", reason))]
86+
AmdDriverNotLoaded { reason: String },
8187
}
8288

8389
pub(crate) type Result<T> = std::result::Result<T, Error>;

sources/ghostdog/src/main.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ enum SubCommand {
5858
EbsDeviceName(EbsDeviceNameArgs),
5959
EfaPresent(EfaPresentArgs),
6060
NeuronPresent(NeuronPresentArgs),
61+
AmdGpuPresent(AmdGpuPresentArgs),
6162
MatchDriver(MatchDriverArgs),
6263
MatchNvidiaDriver(MatchNvidiaDriverArgs),
6364
WriteInfinibandGuid(WriteInfinibandGuidArgs),
@@ -73,6 +74,11 @@ struct EfaPresentArgs {}
7374
/// Detect if Neuron devices are attached.
7475
struct NeuronPresentArgs {}
7576

77+
// Argument struct for the `amd-gpu-present` subcommand; it carries no fields.
#[derive(FromArgs, PartialEq, Debug)]
78+
#[argh(subcommand, name = "amd-gpu-present")]
79+
/// Detect if AMD GPU devices are attached.
80+
struct AmdGpuPresentArgs {}
81+
7682
#[derive(FromArgs, PartialEq, Debug)]
7783
#[argh(subcommand, name = "scan")]
7884
/// Scan a device to see if it is an ephemeral disk.
@@ -163,6 +169,9 @@ fn main() -> Result<()> {
163169
SubCommand::NeuronPresent(_) => {
164170
is_neuron_attached()?;
165171
}
172+
SubCommand::AmdGpuPresent(_) => {
173+
is_amd_gpu_attached()?;
174+
}
166175
SubCommand::MatchNvidiaDriver(driver) => {
167176
let driver_name = driver.driver_name;
168177
nvidia_driver_supported(&driver_name)?;
@@ -173,6 +182,7 @@ fn main() -> Result<()> {
173182
match driver_name.as_str() {
174183
"nvidia" => nvidia_driver_supported(&flavor_name)?,
175184
"neuron" => match_neuron_driver(&flavor_name)?,
185+
"amd" => match_amd_driver(&flavor_name)?,
176186
_ => {
177187
return Err(error::Error::UnsupportedDriver {
178188
driver: driver_name.to_string(),
@@ -203,6 +213,14 @@ fn is_neuron_attached() -> Result<()> {
203213
}
204214
}
205215

216+
fn is_amd_gpu_attached() -> Result<()> {
217+
if pciclient::is_amd_gpu_attached().context(error::CheckAmdGpuFailureSnafu)? {
218+
Ok(())
219+
} else {
220+
Err(error::Error::NoAmdGpuPresent)
221+
}
222+
}
223+
206224
// Returns true if this is an inf1 instance
207225
fn is_inf1_instance() -> Result<()> {
208226
if pciclient::is_inf1_instance().context(error::CheckInf1FailureSnafu)? {
@@ -412,6 +430,35 @@ fn match_neuron_driver(driver_flavor: &str) -> Result<()> {
412430
}
413431
}
414432

433+
fn match_amd_driver(driver_flavor: &str) -> Result<()> {
434+
match driver_flavor {
435+
"rocm" => amd_driver_loaded(),
436+
_ => Err(error::Error::UnsupportedDriverFlavor {
437+
driver: "amd".to_string(),
438+
flavor: driver_flavor.to_string(),
439+
}),
440+
}
441+
}
442+
443+
fn amd_driver_loaded() -> Result<()> {
444+
let kfd_path = "/sys/class/kfd";
445+
let amdgpu_module_path = "/sys/module/amdgpu";
446+
447+
if !std::path::Path::new(kfd_path).exists() {
448+
return Err(error::Error::AmdDriverNotLoaded {
449+
reason: format!("KFD device path {} not found - ROCm kernel driver not loaded", kfd_path),
450+
});
451+
}
452+
453+
if !std::path::Path::new(amdgpu_module_path).exists() {
454+
return Err(error::Error::AmdDriverNotLoaded {
455+
reason: format!("amdgpu kernel module not loaded ({} not found)", amdgpu_module_path),
456+
});
457+
}
458+
459+
Ok(())
460+
}
461+
415462
// Known system partition types for Bottlerocket.
416463
lazy_static! {
417464
static ref SYSTEM_PARTITION_TYPES: HashSet<[u8; 16]> = [

sources/pciclient/src/lib.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ mod private;
99

1010
use private::{
1111
call_list_devices, check_efa_attachment, check_inf1_attachment, check_inf2_attachment,
12-
check_neuron_attachment, PciClient,
12+
check_neuron_attachment, check_amd_gpu_attachment, PciClient,
1313
};
1414

1515
use bon::Builder;
@@ -123,6 +123,11 @@ pub fn is_neuron_attached() -> Result<bool> {
123123
check_neuron_attachment(PciClient {})
124124
}
125125

126+
/// Call `lspci` and check if there is any AMD GPU device attached.
127+
pub fn is_amd_gpu_attached() -> Result<bool> {
128+
check_amd_gpu_attachment(PciClient {})
129+
}
130+
126131
/// Call `lspci` and check if there are inf1 devices attached
127132
pub fn is_inf1_instance() -> Result<bool> {
128133
check_inf1_attachment(PciClient {})

sources/pciclient/src/private.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use crate::{
1414

1515
const AMAZON_VENDOR_CODE: &str = "1d0f";
1616
const EFA_KEYWORD: &str = "efa";
17+
const AMD_VENDOR_CODE: &str = "1002";
1718

1819
// Neuron devices specific to inf1 instance types
1920
lazy_static! {
@@ -34,6 +35,12 @@ lazy_static! {
3435
};
3536
}
3637

38+
// PCI device codes that are recognised as AMD GPUs.
lazy_static! {
    static ref AMD_GPU_DEVICES: HashSet<&'static str> = HashSet::from([
        "75a3", // AMD Instinct MI355X
    ]);
}
43+
3744
const LSPCI_PATH: &str = "/usr/bin/lspci";
3845

3946
pub(crate) trait CommandExecutor {
@@ -183,6 +190,19 @@ pub(crate) fn check_neuron_attachment<T: CommandExecutor>(command_executor: T) -
183190
}))
184191
}
185192

193+
/// Call `lspci` and check if there is any AMD GPU device attached.
194+
/// Internal usage, adding command_executor as parameter allows us to better unit test.
195+
/// For external usage, check [`crate::is_amd_gpu_attached`].
196+
pub(crate) fn check_amd_gpu_attachment<T: CommandExecutor>(command_executor: T) -> Result<bool> {
197+
let list_devices_param = ListDevicesParam::builder()
198+
.vendor(AMD_VENDOR_CODE.to_string())
199+
.build();
200+
let list_device_results = call_list_devices(command_executor, list_devices_param)?;
201+
Ok(list_device_results
202+
.iter()
203+
.any(|device_info| AMD_GPU_DEVICES.contains(&device_info.device.as_str())))
204+
}
205+
186206
/// Call `lspci` and check if there is any inf1-specific Neuron device attached.
187207
/// Internal usage, adding command_executor as parameter allows us to better unit test.
188208
/// For external usage, check [`crate::is_inf1_instance`].
@@ -229,7 +249,7 @@ mod test {
229249

230250
use super::{
231251
call_list_devices, check_efa_attachment, check_inf1_attachment, check_neuron_attachment,
232-
parse_list_devices_output, CommandExecutor,
252+
check_amd_gpu_attachment, parse_list_devices_output, CommandExecutor,
233253
};
234254

235255
struct MockPciClient {
@@ -436,6 +456,31 @@ mod test {
436456
assert!(check_inf1_attachment_result.unwrap());
437457
}
438458

459+
#[test]
fn test_is_amd_gpu_attached() {
    // AMD Instinct MI355X device has vendor 1002 for AMD, device code 75a3.
    // The second line is a device from a different vendor (10de).
    let lspci_lines = vec![
        r#"00:1e.0 "0300" "1002" "75a3" -p00 "1002" "0123""#.to_string(),
        r#"00:1f.0 "0302" "10de" "1eb8" -ra1 -p00 "10de" "12a2""#.to_string(),
    ];
    let client = MockPciClient {
        output: lspci_lines,
    };

    let outcome = check_amd_gpu_attachment(client);

    assert!(outcome.is_ok());
    assert!(outcome.unwrap());
}
472+
473+
#[test]
fn test_is_amd_gpu_attached_negative_case() {
    // Below is an actual output from lspci for ena device (not AMD GPU).
    let lspci_lines = vec![r#"00:06.0 "0200" "1d0f" "ec20" -p00 "1d0f" "ec20""#.to_string()];
    let client = MockPciClient {
        output: lspci_lines,
    };

    let outcome = check_amd_gpu_attachment(client);

    assert!(outcome.is_ok());
    assert!(!outcome.unwrap());
}
483+
439484
#[test]
440485
fn test_is_inf1_attached_negative_case() {
441486
let mock_pci_client = MockPciClient {

0 commit comments

Comments
 (0)