Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 145 additions & 10 deletions src/extract_gpuinfo_nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -607,26 +607,161 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {

// Device memory info (total,used,free)
bool got_meminfo = false;
bool has_unified_memory = false;

if (nvmlDeviceGetMemoryInfo_v2) {
nvmlMemory_v2_t memory_info;
memory_info.version = 2;
last_nvml_return_status = nvmlDeviceGetMemoryInfo_v2(device, &memory_info);
if (last_nvml_return_status == NVML_SUCCESS) {
got_meminfo = true;
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free);
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total);
// Check if this is a unified memory GPU (total == 0 indicates unified memory)
if (memory_info.total == 0) {
has_unified_memory = true;
} else {
got_meminfo = true;
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free);
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total);
}
} else {
// Memory query failed - likely unified memory GPU (error code 13 = NOT_SUPPORTED)
has_unified_memory = true;
}
}
if (!got_meminfo && nvmlDeviceGetMemoryInfo) {
if (!got_meminfo && !has_unified_memory && nvmlDeviceGetMemoryInfo) {
nvmlMemory_v1_t memory_info;
last_nvml_return_status = nvmlDeviceGetMemoryInfo(device, &memory_info);
if (last_nvml_return_status == NVML_SUCCESS) {
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free);
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total);
// Check if this is a unified memory GPU (total == 0 indicates unified memory)
if (memory_info.total == 0) {
has_unified_memory = true;
} else {
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free);
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total);
}
} else {
// Memory query failed - likely unified memory GPU
has_unified_memory = true;
}
}

// Handle unified memory GPUs - query actual GPU allocations and system memory
if (has_unified_memory) {
// Get actual GPU memory usage from running processes
unsigned long long gpu_used_memory = 0;

// Sum up memory used by compute processes
if (nvmlDeviceGetComputeRunningProcesses_v3 || nvmlDeviceGetComputeRunningProcesses_v2 ||
nvmlDeviceGetComputeRunningProcesses_v1) {
unsigned int process_count = 0;
nvmlReturn_t (*getProcesses)(nvmlDevice_t, unsigned int *, void *) = NULL;
size_t process_info_size = 0;

// Choose the latest available version
if (nvmlDeviceGetComputeRunningProcesses_v3) {
getProcesses = nvmlDeviceGetComputeRunningProcesses[3];
process_info_size = sizeof(nvmlProcessInfo_v3_t);
} else if (nvmlDeviceGetComputeRunningProcesses_v2) {
getProcesses = nvmlDeviceGetComputeRunningProcesses[2];
process_info_size = sizeof(nvmlProcessInfo_v2_t);
} else {
getProcesses = nvmlDeviceGetComputeRunningProcesses[1];
process_info_size = sizeof(nvmlProcessInfo_v1_t);
}

// First call to get count
nvmlReturn_t ret = getProcesses(device, &process_count, NULL);
if (ret == NVML_SUCCESS || ret == NVML_ERROR_INSUFFICIENT_SIZE) {
if (process_count > 0) {
void *process_infos = malloc(process_count * process_info_size);
if (process_infos) {
ret = getProcesses(device, &process_count, process_infos);
if (ret == NVML_SUCCESS) {
// Sum up memory from all processes
for (unsigned int i = 0; i < process_count; i++) {
if (nvmlDeviceGetComputeRunningProcesses_v3) {
gpu_used_memory += ((nvmlProcessInfo_v3_t *)process_infos)[i].usedGpuMemory;
} else if (nvmlDeviceGetComputeRunningProcesses_v2) {
gpu_used_memory += ((nvmlProcessInfo_v2_t *)process_infos)[i].usedGpuMemory;
} else {
gpu_used_memory += ((nvmlProcessInfo_v1_t *)process_infos)[i].usedGpuMemory;
}
}
}
free(process_infos);
}
}
}
}

// Also check graphics processes
if (nvmlDeviceGetGraphicsRunningProcesses_v3 || nvmlDeviceGetGraphicsRunningProcesses_v2 ||
nvmlDeviceGetGraphicsRunningProcesses_v1) {
unsigned int process_count = 0;
nvmlReturn_t (*getProcesses)(nvmlDevice_t, unsigned int *, void *) = NULL;
size_t process_info_size = 0;

if (nvmlDeviceGetGraphicsRunningProcesses_v3) {
getProcesses = nvmlDeviceGetGraphicsRunningProcesses[3];
process_info_size = sizeof(nvmlProcessInfo_v3_t);
} else if (nvmlDeviceGetGraphicsRunningProcesses_v2) {
getProcesses = nvmlDeviceGetGraphicsRunningProcesses[2];
process_info_size = sizeof(nvmlProcessInfo_v2_t);
} else {
getProcesses = nvmlDeviceGetGraphicsRunningProcesses[1];
process_info_size = sizeof(nvmlProcessInfo_v1_t);
}

nvmlReturn_t ret = getProcesses(device, &process_count, NULL);
if (ret == NVML_SUCCESS || ret == NVML_ERROR_INSUFFICIENT_SIZE) {
if (process_count > 0) {
void *process_infos = malloc(process_count * process_info_size);
if (process_infos) {
ret = getProcesses(device, &process_count, process_infos);
if (ret == NVML_SUCCESS) {
for (unsigned int i = 0; i < process_count; i++) {
if (nvmlDeviceGetGraphicsRunningProcesses_v3) {
gpu_used_memory += ((nvmlProcessInfo_v3_t *)process_infos)[i].usedGpuMemory;
} else if (nvmlDeviceGetGraphicsRunningProcesses_v2) {
gpu_used_memory += ((nvmlProcessInfo_v2_t *)process_infos)[i].usedGpuMemory;
} else {
gpu_used_memory += ((nvmlProcessInfo_v1_t *)process_infos)[i].usedGpuMemory;
}
}
}
free(process_infos);
}
}
}
}

// Read MemAvailable from /proc/meminfo for available memory
FILE *meminfo = fopen("/proc/meminfo", "r");
if (meminfo) {
unsigned long long available_ram = 0;
char line[256];

while (fgets(line, sizeof(line), meminfo)) {
if (sscanf(line, "MemAvailable: %llu kB", &available_ram) == 1) {
available_ram *= 1024; // Convert KB to bytes
break;
}
}
fclose(meminfo);

if (available_ram > 0) {
unsigned long long total_memory = gpu_used_memory + available_ram;

SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, total_memory);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, gpu_used_memory);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, available_ram);
if (total_memory > 0) {
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, gpu_used_memory * 100 / total_memory);
}
}
}
}

Expand Down
Loading