diff --git a/Cargo.lock b/Cargo.lock index 57b1b5c8a..27a0ca7bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,6 +136,7 @@ dependencies = [ "libcrypto", "libcryptsetup", "libdevmapper", + "libdrm", "libelf", "libexpat", "libffi", @@ -188,6 +189,7 @@ dependencies = [ "rdma-core", "readline", "release", + "rocm-k8s-device-plugin", "rottweiler", "runc", "selinux-policy", @@ -633,6 +635,13 @@ dependencies = [ "util-linux", ] +[[package]] +name = "libdrm" +version = "0.1.0" +dependencies = [ + "glibc", +] + [[package]] name = "libelf" version = "0.1.0" @@ -1047,6 +1056,15 @@ dependencies = [ name = "release" version = "0.0.0" +[[package]] +name = "rocm-k8s-device-plugin" +version = "0.1.0" +dependencies = [ + "glibc", + "hwloc", + "libdrm", +] + [[package]] name = "rottweiler" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 294707663..76598a22f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ members = [ "packages/libcap", "packages/libcrypto", "packages/libdevmapper", + "packages/libdrm", "packages/libelf", "packages/libexpat", "packages/libffi", @@ -117,6 +118,7 @@ members = [ "packages/rdma-core", "packages/readline", "packages/release", + "packages/rocm-k8s-device-plugin", "packages/runc", "packages/selinux-policy", "packages/soci-snapshotter", diff --git a/kits/bottlerocket-core-kit/Cargo.toml b/kits/bottlerocket-core-kit/Cargo.toml index 1605a115a..167a6a241 100644 --- a/kits/bottlerocket-core-kit/Cargo.toml +++ b/kits/bottlerocket-core-kit/Cargo.toml @@ -76,6 +76,7 @@ libcap = { path = "../../packages/libcap" } libcrypto = { path = "../../packages/libcrypto" } libcryptsetup = { path = "../../packages/libcryptsetup" } libdevmapper = { path = "../../packages/libdevmapper" } +libdrm = { path = "../../packages/libdrm" } libelf = { path = "../../packages/libelf" } libexpat = { path = "../../packages/libexpat" } libffi = { path = "../../packages/libffi" } @@ -128,6 +129,7 @@ procps = { path = "../../packages/procps" } rdma-core = { path = 
"../../packages/rdma-core" } readline = { path = "../../packages/readline" } release = { path = "../../packages/release" } +rocm-k8s-device-plugin = { path = "../../packages/rocm-k8s-device-plugin" } rottweiler = { path = "../../packages/rottweiler" } runc = { path = "../../packages/runc" } selinux-policy = { path = "../../packages/selinux-policy" } diff --git a/packages/libdrm/Cargo.toml b/packages/libdrm/Cargo.toml new file mode 100644 index 000000000..d96d6f895 --- /dev/null +++ b/packages/libdrm/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "libdrm" +version = "0.1.0" +edition = "2021" +publish = false +build = "../build.rs" + +[lib] +path = "../packages.rs" + +[package.metadata.build-package] +releases-url = "https://dri.freedesktop.org/libdrm/" + +[[package.metadata.build-package.external-files]] +url = "https://dri.freedesktop.org/libdrm/libdrm-2.4.128.tar.xz" +sha512 = "b80e6be1c9d0427e1c3ffd018213d7230333f037498cf98819a8a6c50d923ad3472002044e010ca9dc646ef79dbca241bd47eaa992014cb7063b31cdb84037c7" +force-upstream = true + +[[package.metadata.build-package.external-files]] +url = "https://dri.freedesktop.org/libdrm/libdrm-2.4.128.tar.xz.sig" +sha512 = "985bb55a51d04cedde1368b9593cbb67a6ed0779f67aa3f8fdefb296be046ad7de22f9715bec4f8c96bf27de592810a4d686d0de13009ef11b5d1c1b7ac103cd" +force-upstream = true + +[build-dependencies] +glibc = { path = "../glibc" } diff --git a/packages/libdrm/libdrm.spec b/packages/libdrm/libdrm.spec new file mode 100644 index 000000000..7e94d9cb8 --- /dev/null +++ b/packages/libdrm/libdrm.spec @@ -0,0 +1,70 @@ +Name: %{_cross_os}libdrm +Version: 2.4.128 +Release: 1%{?dist} +Summary: Direct Rendering Manager runtime library +License: MIT +URL: https://dri.freedesktop.org +Source0: https://dri.freedesktop.org/libdrm/libdrm-%{version}.tar.xz + +BuildRequires: %{_cross_os}glibc-devel +Requires: %{_cross_os}glibc + +%description +%{summary}. 
+ +%package devel +Summary: Files for development using the direct rendering manager library +Requires: %{name} + +%description devel +%{summary}. + +%prep +%autosetup -n libdrm-%{version} -p1 + +%build +CONFIGURE_OPTS=( + --auto-features=disabled + -Dcairo-tests=disabled + -Dman-pages=disabled + -Dvalgrind=disabled + -Dfreedreno=disabled + -Dvc4=disabled + -Detnaviv=disabled + -Dexynos=disabled + -Dtegra=disabled + -Domap=disabled + -Dintel=disabled + -Dradeon=enabled + -Damdgpu=enabled + -Dnouveau=disabled + -Dtests=false +) + +%cross_meson "${CONFIGURE_OPTS[@]}" +%cross_meson_build + +%install +%cross_meson_install + +%files +%{_cross_attribution_file} +%{_cross_libdir}/libdrm.so.2 +%{_cross_libdir}/libdrm.so.2.128.0 +%{_cross_libdir}/libdrm_amdgpu.so.1 +%{_cross_libdir}/libdrm_amdgpu.so.1.128.0 +%{_cross_libdir}/libdrm_radeon.so.1 +%{_cross_libdir}/libdrm_radeon.so.1.128.0 +%{_cross_includedir}/libsync.h +%{_cross_datadir}/libdrm/amdgpu.ids + +%files devel +%{_cross_libdir}/libdrm.so +%{_cross_libdir}/libdrm_amdgpu.so +%{_cross_libdir}/libdrm_radeon.so +%{_cross_includedir}/libdrm/ +%{_cross_includedir}/xf86drm.h +%{_cross_includedir}/xf86drmMode.h +%{_cross_pkgconfigdir}/libdrm.pc +%{_cross_pkgconfigdir}/libdrm_amdgpu.pc +%{_cross_pkgconfigdir}/libdrm_radeon.pc diff --git a/packages/rocm-k8s-device-plugin/Cargo.toml b/packages/rocm-k8s-device-plugin/Cargo.toml new file mode 100644 index 000000000..580cb719c --- /dev/null +++ b/packages/rocm-k8s-device-plugin/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "rocm-k8s-device-plugin" +version = "0.1.0" +edition = "2021" +publish = false +build = "../build.rs" + +[lib] +path = "../packages.rs" + +[package.metadata.build-package] +releases-url = "https://github.com/ROCm/k8s-device-plugin/releases" + +[[package.metadata.build-package.external-files]] +url = "https://github.com/ROCm/k8s-device-plugin/archive/v1.31.0.8.tar.gz" +sha512 = 
"23a127b46ad15cabbdd9abe18a8b75140340dcb10f41c9efdcfd30b38db7142edabaf6a97ed2c621f0414c78c0cb87fdc81a30b5e3fb016d16cbd11210143326" +force-upstream = true + +[build-dependencies] +glibc = { path = "../glibc" } +libdrm = { path = "../libdrm" } +hwloc = { path = "../hwloc" } diff --git a/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.service b/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.service new file mode 100644 index 000000000..ae4209689 --- /dev/null +++ b/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.service @@ -0,0 +1,19 @@ +[Unit] +Description=Start ROCm kubernetes device plugin +After=kubelet.service +Wants=kubelet.service + +[Service] +# Ensure that the kubelet device plugin socket exists before we start +# A brief sleep is needed to avoid the `test` failing its first check +ExecStartPre=/usr/bin/sleep 0.1 +ExecStartPre=/usr/bin/test -S /var/lib/kubelet/device-plugins/kubelet.sock +ExecStart=/usr/bin/amd-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 +Type=simple +TimeoutSec=0 +RestartSec=2 +Restart=always +StandardError=journal+console + +[Install] +WantedBy=multi-user.target diff --git a/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.spec b/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.spec new file mode 100644 index 000000000..51d1763d2 --- /dev/null +++ b/packages/rocm-k8s-device-plugin/rocm-k8s-device-plugin.spec @@ -0,0 +1,75 @@ +%global goproject github.com/ROCm +%global gorepo k8s-device-plugin +%global goimport %{goproject}/%{gorepo} + +%global gover 1.31.0.8 +%global rpmver %{gover} + +Name: %{_cross_os}rocm-k8s-device-plugin +Version: %{rpmver} +Release: 1%{?dist} +Summary: Kubernetes device plugin for AMD GPUs +License: Apache-2.0 +URL: https://github.com/ROCm/k8s-device-plugin +Source0: https://github.com/ROCm/k8s-device-plugin/archive/v%{gover}.tar.gz +Source1: rocm-k8s-device-plugin.service + +BuildRequires: %{_cross_os}glibc-devel +BuildRequires: %{_cross_os}libdrm-devel
+BuildRequires: %{_cross_os}hwloc-devel +Requires: %{name}(binaries) +Requires: %{_cross_os}libdrm +Requires: %{_cross_os}hwloc + +%description +%{summary}. + +%package bin +Summary: Kubernetes device plugin for AMD GPUs binaries +Provides: %{name}(binaries) +Requires: (%{_cross_os}image-feature(no-fips) and %{name}) +Conflicts: (%{_cross_os}image-feature(fips) or %{name}-fips-bin) + +%description bin +%{summary}. + +%package fips-bin +Summary: Kubernetes device plugin for AMD GPUs binaries, FIPS edition +Provides: %{name}(binaries) +Requires: (%{_cross_os}image-feature(fips) and %{name}) +Conflicts: (%{_cross_os}image-feature(no-fips) or %{name}-bin) + +%description fips-bin +%{summary}. + +%prep +%autosetup -n %{gorepo}-%{gover} -p1 +%cross_go_setup %{gorepo}-%{gover} %{goproject} %{goimport} + +%build +%cross_go_configure %{goimport} +%set_cross_go_flags + +go build -ldflags="${GOLDFLAGS}" -o amd-device-plugin ./cmd/k8s-device-plugin/ +gofips build -ldflags="${GOLDFLAGS}" -o fips/amd-device-plugin ./cmd/k8s-device-plugin/ + +%install +install -d %{buildroot}%{_cross_bindir} +install -p -m 0755 amd-device-plugin %{buildroot}%{_cross_bindir} + +install -d %{buildroot}%{_cross_fips_bindir} +install -p -m 0755 fips/amd-device-plugin %{buildroot}%{_cross_fips_bindir} + +install -d %{buildroot}%{_cross_unitdir} +install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir} + +%files +%license LICENSE +%{_cross_attribution_file} +%{_cross_unitdir}/rocm-k8s-device-plugin.service + +%files bin +%{_cross_bindir}/amd-device-plugin + +%files fips-bin +%{_cross_fips_bindir}/amd-device-plugin diff --git a/sources/ghostdog/src/error.rs b/sources/ghostdog/src/error.rs index d086288e9..b3a171639 100644 --- a/sources/ghostdog/src/error.rs +++ b/sources/ghostdog/src/error.rs @@ -32,10 +32,14 @@ pub(super) enum Error { CheckEfaFailure { source: pciclient::PciClientError }, #[snafu(display("Failed to check if Neuron device is attached: {}", source))] CheckNeuronFailure { source: 
pciclient::PciClientError }, + #[snafu(display("Failed to check if AMD GPU device is attached: {}", source))] + CheckAmdGpuFailure { source: pciclient::PciClientError }, #[snafu(display("Did not detect EFA"))] NoEfaPresent, #[snafu(display("Did not detect Neuron"))] NoNeuronPresent, + #[snafu(display("Did not detect AMD GPU"))] + NoAmdGpuPresent, #[snafu(display("'{}' has no parent directory", path.display()))] NoParentDirectory { path: std::path::PathBuf }, #[snafu(display("Failed to open '{}': {}", path.display(), source))] diff --git a/sources/ghostdog/src/main.rs b/sources/ghostdog/src/main.rs index 42ad171d9..8abbf6154 100644 --- a/sources/ghostdog/src/main.rs +++ b/sources/ghostdog/src/main.rs @@ -58,6 +58,7 @@ enum SubCommand { EbsDeviceName(EbsDeviceNameArgs), EfaPresent(EfaPresentArgs), NeuronPresent(NeuronPresentArgs), + AmdGpuPresent(AmdGpuPresentArgs), MatchDriver(MatchDriverArgs), MatchNvidiaDriver(MatchNvidiaDriverArgs), WriteInfinibandGuid(WriteInfinibandGuidArgs), @@ -73,6 +74,11 @@ struct EfaPresentArgs {} /// Detect if Neuron devices are attached. struct NeuronPresentArgs {} +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "amd-gpu-present")] +/// Detect if AMD GPU devices are attached. +struct AmdGpuPresentArgs {} + #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand, name = "scan")] /// Scan a device to see if it is an ephemeral disk. @@ -163,6 +169,9 @@ fn main() -> Result<()> { SubCommand::NeuronPresent(_) => { is_neuron_attached()?; } + SubCommand::AmdGpuPresent(_) => { + is_amd_gpu_attached()?; + } SubCommand::MatchNvidiaDriver(driver) => { let driver_name = driver.driver_name; nvidia_driver_supported(&driver_name)?; @@ -203,6 +212,14 @@ fn is_neuron_attached() -> Result<()> { } } +fn is_amd_gpu_attached() -> Result<()> { + if pciclient::is_amd_gpu_attached().context(error::CheckAmdGpuFailureSnafu)? 
{ + Ok(()) + } else { + Err(error::Error::NoAmdGpuPresent) + } +} + // Returns true if this is an inf1 instance fn is_inf1_instance() -> Result<()> { if pciclient::is_inf1_instance().context(error::CheckInf1FailureSnafu)? { diff --git a/sources/pciclient/src/lib.rs b/sources/pciclient/src/lib.rs index ed391eba3..6a79bd019 100644 --- a/sources/pciclient/src/lib.rs +++ b/sources/pciclient/src/lib.rs @@ -9,7 +9,7 @@ mod private; use private::{ call_list_devices, check_efa_attachment, check_inf1_attachment, check_inf2_attachment, - check_neuron_attachment, PciClient, + check_neuron_attachment, check_amd_gpu_attachment, PciClient, }; use bon::Builder; @@ -123,6 +123,11 @@ pub fn is_neuron_attached() -> Result<bool> { check_neuron_attachment(PciClient {}) } +/// Call `lspci` and check if there is any AMD GPU device attached. +pub fn is_amd_gpu_attached() -> Result<bool> { + check_amd_gpu_attachment(PciClient {}) +} + /// Call `lspci` and check if there are inf1 devices attached pub fn is_inf1_instance() -> Result<bool> { check_inf1_attachment(PciClient {}) diff --git a/sources/pciclient/src/private.rs b/sources/pciclient/src/private.rs index 7fb717569..e6f38068c 100644 --- a/sources/pciclient/src/private.rs +++ b/sources/pciclient/src/private.rs @@ -14,6 +14,7 @@ use crate::{ const AMAZON_VENDOR_CODE: &str = "1d0f"; const EFA_KEYWORD: &str = "efa"; +const AMD_VENDOR_CODE: &str = "1002"; // Neuron devices specific to inf1 instance types lazy_static! { @@ -34,6 +35,12 @@ lazy_static! { }; } +lazy_static! { + static ref AMD_GPU_DEVICES: HashSet<&'static str> = hashset! { + "75a3" // AMD Instinct MI355X + }; +} + const LSPCI_PATH: &str = "/usr/bin/lspci"; pub(crate) trait CommandExecutor { @@ -183,6 +190,19 @@ pub(crate) fn check_neuron_attachment<T: CommandExecutor>(command_executor: T) - })) } +/// Call `lspci` and check if there is any AMD GPU device attached. +/// Internal usage, adding command_executor as parameter allows us to better unit test. +/// For external usage, check [`crate::is_amd_gpu_attached`].
+pub(crate) fn check_amd_gpu_attachment<T: CommandExecutor>(command_executor: T) -> Result<bool> { + let list_devices_param = ListDevicesParam::builder() + .vendor(AMD_VENDOR_CODE.to_string()) + .build(); + let list_device_results = call_list_devices(command_executor, list_devices_param)?; + Ok(list_device_results + .iter() + .any(|device_info| AMD_GPU_DEVICES.contains(&device_info.device.as_str()))) +} + /// Call `lspci` and check if there is any inf1-specific Neuron device attached. /// Internal usage, adding command_executor as parameter allows us to better unit test. /// For external usage, check [`crate::is_inf1_instance`]. @@ -229,7 +249,7 @@ mod test { use super::{ call_list_devices, check_efa_attachment, check_inf1_attachment, check_neuron_attachment, - parse_list_devices_output, CommandExecutor, + check_amd_gpu_attachment, parse_list_devices_output, CommandExecutor, }; struct MockPciClient { @@ -436,6 +456,31 @@ mod test { assert!(check_inf1_attachment_result.unwrap()); } + #[test] + fn test_is_amd_gpu_attached() { + let mock_pci_client = MockPciClient { + // AMD Instinct MI355X device has vendor 1002 for AMD, device code 75a3. + output: vec![ + r#"00:1e.0 "0300" "1002" "75a3" -p00 "1002" "0123""#.to_string(), + r#"00:1f.0 "0302" "10de" "1eb8" -ra1 -p00 "10de" "12a2""#.to_string(), + ], + }; + let check_amd_gpu_attachment_result = check_amd_gpu_attachment(mock_pci_client); + assert!(check_amd_gpu_attachment_result.is_ok()); + assert!(check_amd_gpu_attachment_result.unwrap()); + } + + #[test] + fn test_is_amd_gpu_attached_negative_case() { + let mock_pci_client = MockPciClient { + // Below is an actual output from lspci for ena device (not AMD GPU).
+ output: vec![r#"00:06.0 "0200" "1d0f" "ec20" -p00 "1d0f" "ec20""#.to_string()], + }; + let check_amd_gpu_attachment_result = check_amd_gpu_attachment(mock_pci_client); + assert!(check_amd_gpu_attachment_result.is_ok()); + assert!(!check_amd_gpu_attachment_result.unwrap()); + } + #[test] fn test_is_inf1_attached_negative_case() { let mock_pci_client = MockPciClient {