Skip to content

Commit 896c93e

Browse files
committed
Detect AMD GPUs without lspci
Shelling out to lspci and relying upon pciids.txt is fragile, especially when trying to run on an old distro. To avoid this risk, instead parse the KFD sysfs files to determine whether each device is an APU or a dGPU and to build the graphics architecture name. Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
1 parent 36167aa commit 896c93e

File tree

3 files changed

+158
-197
lines changed

3 files changed

+158
-197
lines changed

src/cpp/server/system_info.cpp

Lines changed: 79 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,17 @@ bool SystemInfo::check_vulkan_support() {
379379
static std::string identify_rocm_arch_from_name(const std::string& device_name) {
380380
std::string device_lower = device_name;
381381
std::transform(device_lower.begin(), device_lower.end(), device_lower.begin(), ::tolower);
382-
382+
383+
// linux will pass the ISA from KFD, transform it to what the rest of lemonade expects
384+
if (std::all_of(device_lower.begin(), device_lower.end(), ::isdigit)) {
385+
if (device_lower.length() >= 4) {
386+
std::string major = device_lower.substr(0, 2);
387+
std::string minor = device_lower.substr(2, 1);
388+
std::string revision = device_lower.substr(3, 1);
389+
return "gfx" + major + minor + revision;
390+
}
391+
}
392+
383393
if (device_lower.find("radeon") == std::string::npos) {
384394
return "";
385395
}
@@ -1290,92 +1300,85 @@ NPUInfo LinuxSystemInfo::get_npu_device() {
12901300

12911301
std::vector<GPUInfo> LinuxSystemInfo::detect_amd_gpus(const std::string& gpu_type) {
12921302
std::vector<GPUInfo> gpus;
1293-
1294-
// Execute lspci to find GPUs
1295-
FILE* pipe = popen("lspci 2>/dev/null | grep -iE 'vga|3d|display'", "r");
1296-
if (!pipe) {
1303+
std::string kfd_path = "/sys/class/kfd/kfd/topology/nodes";
1304+
1305+
if (!fs::exists(kfd_path)) {
12971306
GPUInfo gpu;
12981307
gpu.available = false;
1299-
gpu.error = "Failed to execute lspci command";
1308+
gpu.error = "No KFD nodes found (AMD GPU driver not loaded or no GPU present)";
13001309
gpus.push_back(gpu);
13011310
return gpus;
13021311
}
1303-
1304-
char buffer[512];
1305-
std::vector<std::string> lspci_lines;
1306-
while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
1307-
lspci_lines.push_back(buffer);
1308-
}
1309-
pclose(pipe);
1310-
1311-
// Parse AMD GPUs
1312-
for (const auto& line : lspci_lines) {
1313-
if (line.find("AMD") != std::string::npos || line.find("ATI") != std::string::npos) {
1314-
// Extract device name
1315-
std::string name;
1316-
size_t pos = line.find(": ");
1317-
if (pos != std::string::npos) {
1318-
name = line.substr(pos + 2);
1319-
// Remove newline
1320-
if (!name.empty() && name.back() == '\n') {
1321-
name.pop_back();
1312+
1313+
for (const auto& node_entry : fs::directory_iterator(kfd_path)) {
1314+
if (!node_entry.is_directory()) continue;
1315+
1316+
std::string node_path = node_entry.path().string();
1317+
std::string properties_file = node_path + "/properties";
1318+
1319+
if (!fs::exists(properties_file)) continue;
1320+
1321+
std::ifstream props(properties_file);
1322+
if (!props.is_open()) continue;
1323+
1324+
std::string line;
1325+
std::string drm_render_minor;
1326+
std::string gfx_target_version;
1327+
1328+
bool is_gpu = false;
1329+
1330+
while (std::getline(props, line)) {
1331+
if (line.find("gfx_target_version") == 0) {
1332+
gfx_target_version = line.substr(line.find(" ") + 1);
1333+
gfx_target_version.erase(gfx_target_version.find_last_not_of(" \t\n\r") + 1);
1334+
if (!gfx_target_version.empty() && std::stoi(gfx_target_version) != 0) {
1335+
is_gpu = true;
13221336
}
1323-
} else {
1324-
name = line;
1337+
} else if (line.find("drm_render_minor") == 0) {
1338+
drm_render_minor = line.substr(line.find(" ") + 1);
1339+
drm_render_minor.erase(drm_render_minor.find_last_not_of(" \t\n\r") + 1);
13251340
}
1341+
}
1342+
props.close();
1343+
1344+
if (!is_gpu || drm_render_minor.empty() || drm_render_minor == "-1")
1345+
continue;
1346+
1347+
1348+
std::string device_path = "/sys/class/drm/renderD" + drm_render_minor + "/device/";
1349+
std::string board_info_path = device_path + "board_info";
1350+
bool is_integrated = !fs::exists(board_info_path) && fs::is_regular_file(board_info_path);
1351+
1352+
GPUInfo gpu;
1353+
gpu.name = gfx_target_version;
1354+
gpu.available = true;
1355+
1356+
// Get VRAM for discrete GPUs
1357+
if (!is_integrated) {
1358+
std::string vram_file = device_path + "/mem_info_vram_total";
1359+
if (!fs::exists(vram_file))
1360+
continue;
1361+
std::ifstream vram_stream(vram_file);
1362+
std::string vram_str;
1363+
std::getline(vram_stream, vram_str);
1364+
uint64_t vram_bytes = std::stoull(vram_str);
1365+
gpu.vram_gb = std::round(vram_bytes / (1024.0 * 1024.0 * 1024.0) * 10.0) / 10.0;
13261366

1327-
// Classify as discrete or integrated using keywords
1328-
std::string name_lower = name;
1329-
std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
1330-
1331-
bool is_discrete = false;
1332-
for (const auto& keyword : AMD_DISCRETE_GPU_KEYWORDS) {
1333-
if (name_lower.find(keyword) != std::string::npos) {
1334-
is_discrete = true;
1335-
break;
1336-
}
1337-
}
1338-
bool is_integrated = !is_discrete;
1367+
// Detect inference engines
1368+
std::string device_type = is_integrated ? "amd_igpu" : "amd_dgpu";
1369+
gpu.inference_engines = detect_inference_engines(device_type, gfx_target_version);
13391370

1340-
// Filter based on requested type
1341-
if ((gpu_type == "integrated" && is_integrated) ||
1342-
(gpu_type == "discrete" && is_discrete)) {
1343-
1344-
GPUInfo gpu;
1345-
gpu.name = name;
1346-
gpu.available = true;
1347-
1348-
// Get VRAM for discrete GPUs
1349-
if (is_discrete) {
1350-
// Extract PCI ID from lspci line (first field)
1351-
std::string pci_id = line.substr(0, line.find(" "));
1352-
1353-
double vram = get_amd_vram_rocm_smi();
1354-
if (vram == 0.0) {
1355-
vram = get_amd_vram_sysfs(pci_id);
1356-
}
1357-
1358-
if (vram > 0.0) {
1359-
gpu.vram_gb = vram;
1360-
}
1361-
}
1362-
1363-
// Detect inference engines
1364-
std::string device_type = is_integrated ? "amd_igpu" : "amd_dgpu";
1365-
gpu.inference_engines = detect_inference_engines(device_type, name);
1366-
1367-
gpus.push_back(gpu);
1368-
}
1371+
gpus.push_back(gpu);
1372+
}
1373+
1374+
if (gpus.empty()) {
1375+
GPUInfo gpu;
1376+
gpu.available = false;
1377+
gpu.error = "No AMD " + gpu_type + " GPU found in KFD nodes";
1378+
gpus.push_back(gpu);
13691379
}
13701380
}
1371-
1372-
if (gpus.empty()) {
1373-
GPUInfo gpu;
1374-
gpu.available = false;
1375-
gpu.error = "No AMD " + gpu_type + " GPU found";
1376-
gpus.push_back(gpu);
1377-
}
1378-
1381+
13791382
return gpus;
13801383
}
13811384

src/lemonade/common/system_info.py

Lines changed: 62 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import re
66
import subprocess
77
import ctypes
8-
import glob
8+
import os
99
from .inference_engines import detect_inference_engines
1010

1111
# AMD GPU classification keywords - shared across all OS implementations
@@ -555,7 +555,6 @@ def _get_gpu_vram_dxdiag_simple(self, gpu_name: str) -> float:
555555
"""
556556
try:
557557
import tempfile
558-
import os
559558

560559
with tempfile.NamedTemporaryFile(
561560
mode="w+", suffix=".txt", delete=False
@@ -917,51 +916,60 @@ def _detect_amd_gpus(self, gpu_type: str, include_inference_engines: bool = Fals
917916
"""
918917
gpu_devices = []
919918
try:
920-
lspci_output = subprocess.check_output(
921-
"lspci | grep -i 'vga\\|3d\\|display'", shell=True
922-
).decode()
923-
924-
for line in lspci_output.split("\n"):
925-
if line.strip() and "AMD" in line:
926-
name_lower = line.lower()
927-
928-
# Keyword-based classification - simple and reliable
929-
is_discrete_by_name = any(
930-
kw in name_lower for kw in AMD_DISCRETE_GPU_KEYWORDS
931-
)
932-
is_integrated = not is_discrete_by_name
933-
934-
# Filter based on requested type
935-
if (gpu_type == "integrated" and is_integrated) or (
936-
gpu_type == "discrete" and not is_integrated
937-
):
938-
939-
device_type = "amd_igpu" if is_integrated else "amd_dgpu"
940-
device_name = line.split(": ")[1] if ": " in line else line
941-
942-
gpu_info = {
943-
"name": device_name,
944-
"available": True,
945-
}
946-
947-
# Get VRAM information for discrete GPUs
948-
if not is_integrated: # Only add VRAM for discrete GPUs
949-
vram_gb = self._get_amd_vram_rocm_smi_linux()
950-
if vram_gb == 0.0:
951-
# Fallback to sysfs - extract PCI ID from lspci line
952-
pci_id = line.split()[0] if line else ""
953-
vram_gb = self._get_amd_vram_sysfs(pci_id)
954-
955-
if vram_gb > 0.0:
956-
gpu_info["vram_gb"] = vram_gb
957-
else:
958-
gpu_info["vram_gb"] = "Unknown"
919+
gpu_info = {}
920+
kfd_dir = "/sys/class/kfd/kfd/topology/nodes/"
921+
if not os.path.isdir(kfd_dir):
922+
return gpu_devices
923+
924+
for node in os.listdir(kfd_dir):
925+
isa = 0
926+
render_minor = None
927+
properties_path = os.path.join(kfd_dir, node, "properties")
928+
if os.path.isfile(properties_path):
929+
try:
930+
with open(properties_path, "r", encoding="utf-8") as f:
931+
for line in f:
932+
if "gfx_target_version" in line:
933+
isa = line.split()[1].strip()
934+
if "drm_render_minor" in line:
935+
render_minor = line.split()[1].strip()
936+
except (OSError, ValueError):
937+
continue
938+
if int(isa) == 0:
939+
continue
940+
gpu_info = {
941+
"name": f"AMD GPU Node {node}",
942+
"isa": isa,
943+
"drm_render_minor": render_minor,
944+
"available": True,
945+
}
959946

960-
if include_inference_engines:
961-
gpu_info["inference_engines"] = (
962-
self._detect_inference_engines(device_type, device_name)
963-
)
964-
gpu_devices.append(gpu_info)
947+
device_path = (
948+
f"/sys/class/drm/renderD{gpu_info['drm_render_minor']}/device/"
949+
)
950+
board_info_path = os.path.join(device_path, "board_info")
951+
952+
is_discrete = os.path.isfile(board_info_path)
953+
is_integrated = not is_discrete
954+
if (
955+
is_integrated
956+
and gpu_type != "integrated"
957+
or (is_discrete and gpu_type != "discrete")
958+
):
959+
return gpu_devices
960+
961+
# Filter based on requested type
962+
device_type = "amd_igpu" if is_integrated else "amd_dgpu"
963+
964+
# Get VRAM information for discrete GPUs
965+
if is_discrete:
966+
gpu_info["vram_gb"] = self._get_amd_vram_sysfs(device_path)
967+
968+
if include_inference_engines:
969+
gpu_info["inference_engines"] = self._detect_inference_engines(
970+
device_type, gpu_info["isa"]
971+
)
972+
gpu_devices.append(gpu_info)
965973

966974
except Exception as e: # pylint: disable=broad-except
967975
error_msg = f"AMD {gpu_type} GPU detection failed: {e}"
@@ -1216,79 +1224,21 @@ def _get_nvidia_vram_smi_linux(self) -> float:
12161224
pass
12171225
return 0.0
12181226

1219-
def _get_amd_vram_rocm_smi_linux(self) -> float:
1220-
"""
1221-
Get AMD GPU VRAM using rocm-smi command on Linux.
1222-
1223-
Returns:
1224-
float: VRAM in GB, or 0.0 if detection fails
1225-
"""
1226-
try:
1227-
output = (
1228-
subprocess.check_output(
1229-
["rocm-smi", "--showmeminfo", "vram", "--csv"],
1230-
stderr=subprocess.DEVNULL,
1231-
)
1232-
.decode()
1233-
.strip()
1234-
)
1235-
1236-
# Parse CSV output to extract VRAM
1237-
lines = output.split("\n")
1238-
for line in lines:
1239-
if "Total VRAM" in line or "vram" in line.lower():
1240-
# Extract numeric value (assuming it's in MB or GB)
1241-
numbers = re.findall(r"\d+", line)
1242-
if numbers:
1243-
vram_value = int(numbers[0])
1244-
# Assume MB if value is large, GB if small
1245-
if vram_value > 100: # Likely MB
1246-
vram_gb = round(vram_value / 1024, 1)
1247-
else: # Likely GB
1248-
vram_gb = float(vram_value)
1249-
return vram_gb
1250-
except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
1251-
pass
1252-
return 0.0
1253-
1254-
def _get_amd_vram_sysfs(self, pci_id: str) -> float:
1227+
def _get_amd_vram_sysfs(self, device: str) -> float:
12551228
"""
12561229
Get AMD GPU VRAM using sysfs on Linux.
12571230
12581231
Args:
1259-
pci_id: PCI ID of the GPU (e.g., "0000:01:00.0")
1232+
device: base device path
12601233
12611234
Returns:
1262-
float: VRAM in GB, or 0.0 if detection fails
1235+
float: VRAM in GBfails
12631236
"""
1264-
try:
1265-
# Try different sysfs paths for VRAM information
1266-
sysfs_paths = [
1267-
f"/sys/bus/pci/devices/{pci_id}/mem_info_vram_total",
1268-
"/sys/class/drm/card*/device/mem_info_vram_total",
1269-
]
1270-
1271-
for path in sysfs_paths:
1272-
try:
1273-
if "*" in path:
1274-
# Handle wildcard paths
1275-
matching_paths = glob.glob(path)
1276-
for match_path in matching_paths:
1277-
with open(match_path, "r", encoding="utf-8") as f:
1278-
vram_bytes = int(f.read().strip())
1279-
vram_gb = round(vram_bytes / (1024**3), 1)
1280-
if vram_gb > 0:
1281-
return vram_gb
1282-
else:
1283-
with open(path, "r", encoding="utf-8") as f:
1284-
vram_bytes = int(f.read().strip())
1285-
vram_gb = round(vram_bytes / (1024**3), 1)
1286-
return vram_gb
1287-
except (FileNotFoundError, ValueError, PermissionError):
1288-
continue
1289-
except Exception: # pylint: disable=broad-except
1290-
pass
1291-
return 0.0
1237+
path = os.path.join(device, "mem_info_vram_total")
1238+
with open(path, "r", encoding="utf-8") as f:
1239+
vram_bytes = int(f.read().strip())
1240+
vram_gb = round(vram_bytes / (1024**3), 1)
1241+
return vram_gb
12921242

12931243
def _detect_inference_engines(self, device_type: str, device_name: str) -> dict:
12941244
"""

0 commit comments

Comments
 (0)