Skip to content

Commit 7353a7d

Browse files
committed
feat: gpustack add gpu utilization and uuid for mac/linux nvidia
1 parent b001ea4 commit 7353a7d

File tree

14 files changed

+205
-41
lines changed

14 files changed

+205
-41
lines changed

src/detection/gpu/gpu.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
#define FF_GPU_CORE_COUNT_UNSET -1
77
#define FF_GPU_VMEM_SIZE_UNSET ((uint64_t)-1)
88
#define FF_GPU_FREQUENCY_UNSET (0/0.0)
9+
#define FF_GPU_CORE_UTILIZATION_RATE_UNSET -1
910

1011
extern const char* FF_GPU_VENDOR_NAME_APPLE;
1112
extern const char* FF_GPU_VENDOR_NAME_AMD;
@@ -32,10 +33,12 @@ typedef struct FFGPUResult
3233
FFstrbuf platformApi;
3334
double temperature;
3435
int32_t coreCount;
36+
double coreUtilizationRate;
3537
double frequency; // Real time clock frequency in GHz
3638
FFGPUMemory dedicated;
3739
FFGPUMemory shared;
3840
uint64_t deviceId; // Used internally, may be uninitialized
41+
FFstrbuf uuid; // Identifier for the GPU
3942
} FFGPUResult;
4043

4144
const char* ffDetectGPU(const FFGPUOptions* options, FFlist* result);

src/detection/gpu/gpu_apple.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@ const char* ffDetectGPUImpl(const FFGPUOptions* options, FFlist* gpus)
6666
if(ffCfDictGetInt(properties, CFSTR("gpu-core-count"), &gpu->coreCount)) // For Apple
6767
gpu->coreCount = FF_GPU_CORE_COUNT_UNSET;
6868

69+
gpu->coreUtilizationRate = FF_GPU_CORE_UTILIZATION_RATE_UNSET;
70+
if (CFDictionaryContainsKey(properties, CFSTR("PerformanceStatistics")))
71+
{
72+
CFDictionaryRef performanceDict;
73+
if (!ffCfDictGetDict(properties, CFSTR("PerformanceStatistics"), &performanceDict))
74+
{
75+
ffCfDictGetDouble(performanceDict, CFSTR("Device Utilization %"), &gpu->coreUtilizationRate);
76+
}
77+
}
78+
6979
ffStrbufInit(&gpu->name);
7080
//IOAccelerator returns model / vendor-id properties for Apple Silicon, but not for Intel Iris GPUs.
7181
//Still needs testing for AMD's
@@ -84,6 +94,8 @@ const char* ffDetectGPUImpl(const FFGPUOptions* options, FFlist* gpus)
8494
ffCfDictGetString(properties, CFSTR("model"), &gpu->name);
8595
}
8696

97+
gpu->uuid = ffStrbufGetUUID(&gpu->name);
98+
8799
ffStrbufInit(&gpu->vendor);
88100
int vendorId;
89101
if(!ffCfDictGetInt(properties, CFSTR("vendor-id"), &vendorId))

src/detection/gpu/gpu_driver_specific.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,13 @@ typedef struct FFGpuDriverCondition
3737
// detect x if not NULL
3838
typedef struct FFGpuDriverResult
3939
{
40+
FFstrbuf *uuid;
4041
double* temp;
4142
FFGPUMemory* memory;
4243
uint32_t* coreCount;
4344
FFGPUType* type;
4445
double* frequency;
46+
double *coreUtilizationRate;
4547
} FFGpuDriverResult;
4648

4749
const char* ffDetectNvidiaGpuInfo(const FFGpuDriverCondition* cond, FFGpuDriverResult result, const char* soName);

src/detection/gpu/gpu_linux.c

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -200,10 +200,12 @@ static const char* detectPci(const FFGPUOptions* options, FFlist* gpus, FFstrbuf
200200
FFGPUResult* gpu = (FFGPUResult*)ffListAdd(gpus);
201201
ffStrbufInitStatic(&gpu->vendor, ffGetGPUVendorString((uint16_t) vendorId));
202202
ffStrbufInit(&gpu->name);
203+
ffStrbufInit(&gpu->uuid);
203204
ffStrbufInit(&gpu->driver);
204205
ffStrbufInit(&gpu->platformApi);
205206
gpu->temperature = FF_GPU_TEMP_UNSET;
206207
gpu->coreCount = FF_GPU_CORE_COUNT_UNSET;
208+
gpu->coreUtilizationRate = FF_GPU_CORE_UTILIZATION_RATE_UNSET;
207209
gpu->type = FF_GPU_TYPE_UNKNOWN;
208210
gpu->dedicated.total = gpu->dedicated.used = gpu->shared.total = gpu->shared.used = FF_GPU_VMEM_SIZE_UNSET;
209211
gpu->deviceId = ((uint64_t) pciDomain << 6) | ((uint64_t) pciBus << 4) | (deviceId << 2) | pciFunc;
@@ -260,21 +262,25 @@ static const char* detectPci(const FFGPUOptions* options, FFlist* gpus, FFstrbuf
260262
{
261263
if (options->temp || options->driverSpecific)
262264
{
263-
ffDetectNvidiaGpuInfo(&(FFGpuDriverCondition) {
264-
.type = FF_GPU_DRIVER_CONDITION_TYPE_BUS_ID,
265-
.pciBusId = {
266-
.domain = pciDomain,
267-
.bus = pciBus,
268-
.device = pciDevice,
269-
.func = pciFunc,
265+
ffDetectNvidiaGpuInfo(&(FFGpuDriverCondition){
266+
.type = FF_GPU_DRIVER_CONDITION_TYPE_BUS_ID,
267+
.pciBusId = {
268+
.domain = pciDomain,
269+
.bus = pciBus,
270+
.device = pciDevice,
271+
.func = pciFunc,
272+
},
270273
},
271-
}, (FFGpuDriverResult) {
272-
.temp = options->temp ? &gpu->temperature : NULL,
273-
.memory = options->driverSpecific ? &gpu->dedicated : NULL,
274-
.coreCount = options->driverSpecific ? (uint32_t*) &gpu->coreCount : NULL,
275-
.type = &gpu->type,
276-
.frequency = options->driverSpecific ? &gpu->frequency : NULL,
277-
}, "libnvidia-ml.so");
274+
(FFGpuDriverResult){
275+
.temp = options->temp ? &gpu->temperature : NULL,
276+
.memory = options->driverSpecific ? &gpu->dedicated : NULL,
277+
.coreCount = options->driverSpecific ? (uint32_t *)&gpu->coreCount : NULL,
278+
.type = &gpu->type,
279+
.frequency = options->driverSpecific ? &gpu->frequency : NULL,
280+
.coreUtilizationRate = &gpu->coreUtilizationRate,
281+
.uuid = &gpu->uuid,
282+
},
283+
"libnvidia-ml.so");
278284
}
279285

280286
if (gpu->type == FF_GPU_TYPE_UNKNOWN)

src/detection/gpu/gpu_nvidia.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@ struct FFNvmlData {
1010
FF_LIBRARY_SYMBOL(nvmlDeviceGetPciInfo_v3)
1111
FF_LIBRARY_SYMBOL(nvmlDeviceGetTemperature)
1212
FF_LIBRARY_SYMBOL(nvmlDeviceGetMemoryInfo_v2)
13+
FF_LIBRARY_SYMBOL(nvmlDeviceGetMemoryInfo)
1314
FF_LIBRARY_SYMBOL(nvmlDeviceGetNumGpuCores)
1415
FF_LIBRARY_SYMBOL(nvmlDeviceGetMaxClockInfo)
1516
FF_LIBRARY_SYMBOL(nvmlDeviceGetBrand)
17+
FF_LIBRARY_SYMBOL(nvmlDeviceGetUtilizationRates)
18+
FF_LIBRARY_SYMBOL(nvmlDeviceGetUUID)
1619

1720
bool inited;
1821
} nvmlData;
@@ -33,9 +36,12 @@ const char* ffDetectNvidiaGpuInfo(const FFGpuDriverCondition* cond, FFGpuDriverR
3336
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetPciInfo_v3)
3437
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetTemperature)
3538
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetMemoryInfo_v2)
39+
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetMemoryInfo)
3640
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetNumGpuCores)
3741
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetMaxClockInfo)
3842
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetBrand)
43+
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetUtilizationRates)
44+
FF_LIBRARY_LOAD_SYMBOL_VAR_MESSAGE(libnvml, nvmlData, nvmlDeviceGetUUID)
3945

4046
if (ffnvmlInit_v2() != NVML_SUCCESS)
4147
{
@@ -116,6 +122,16 @@ const char* ffDetectNvidiaGpuInfo(const FFGpuDriverCondition* cond, FFGpuDriverR
116122
result.memory->total = memory.used + memory.free;
117123
result.memory->used = memory.used;
118124
}
125+
else
126+
{
127+
nvmlMemory_t memory_v1;
128+
nvmlReturn_t r2 = nvmlData.ffnvmlDeviceGetMemoryInfo(device, &memory_v1);
129+
if (r2 == NVML_SUCCESS)
130+
{
131+
result.memory->total = memory_v1.total;
132+
result.memory->used = memory_v1.used;
133+
}
134+
}
119135
}
120136

121137
if (result.coreCount)
@@ -128,6 +144,20 @@ const char* ffDetectNvidiaGpuInfo(const FFGpuDriverCondition* cond, FFGpuDriverR
128144
*result.frequency = clockMHz / 1000.;
129145
}
130146

147+
if (result.coreUtilizationRate)
148+
{
149+
nvmlUtilization_t utilization;
150+
if (nvmlData.ffnvmlDeviceGetUtilizationRates(device, &utilization) == NVML_SUCCESS)
151+
*result.coreUtilizationRate = utilization.gpu;
152+
}
153+
154+
if (result.uuid)
155+
{
156+
char uuid[NVML_DEVICE_UUID_V2_BUFFER_SIZE];
157+
if (nvmlData.ffnvmlDeviceGetUUID(device, uuid, sizeof(uuid)) == NVML_SUCCESS)
158+
*result.uuid = ffStrbufCreateStatic(uuid);
159+
}
160+
131161
return NULL;
132162

133163
#else

src/detection/gpu/gpu_windows.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ const char* ffDetectGPUImpl(FF_MAYBE_UNUSED const FFGPUOptions* options, FFlist*
8686
gpu->dedicated.total = gpu->dedicated.used = gpu->shared.total = gpu->shared.used = FF_GPU_VMEM_SIZE_UNSET;
8787
gpu->deviceId = 0;
8888
gpu->frequency = FF_GPU_FREQUENCY_UNSET;
89+
gpu->coreUtilizationRate = FF_GPU_CORE_UTILIZATION_RATE_UNSET;
8990

9091
if (deviceKeyLength == 100 && displayDevice.DeviceKey[deviceKeyPrefixLength - 1] == '{')
9192
{
@@ -147,7 +148,7 @@ const char* ffDetectGPUImpl(FF_MAYBE_UNUSED const FFGPUOptions* options, FFlist*
147148
if (vendorId && deviceId && subSystemId)
148149
{
149150
detectFn(
150-
&(FFGpuDriverCondition) {
151+
&(FFGpuDriverCondition){
151152
.type = FF_GPU_DRIVER_CONDITION_TYPE_DEVICE_ID | FF_GPU_DRIVER_CONDITION_TYPE_LUID,
152153
.pciDeviceId = {
153154
.deviceId = deviceId,
@@ -157,15 +158,15 @@ const char* ffDetectGPUImpl(FF_MAYBE_UNUSED const FFGPUOptions* options, FFlist*
157158
},
158159
.luid = gpu->deviceId,
159160
},
160-
(FFGpuDriverResult) {
161+
(FFGpuDriverResult){
161162
.temp = options->temp ? &gpu->temperature : NULL,
162163
.memory = options->driverSpecific ? &gpu->dedicated : NULL,
163-
.coreCount = options->driverSpecific ? (uint32_t*) &gpu->coreCount : NULL,
164+
.coreCount = options->driverSpecific ? (uint32_t *)&gpu->coreCount : NULL,
164165
.type = &gpu->type,
165166
.frequency = options->driverSpecific ? &gpu->frequency : NULL,
167+
.coreUtilizationRate = &gpu->coreUtilizationRate,
166168
},
167-
dllName
168-
);
169+
dllName);
169170
}
170171
}
171172
}

src/detection/gpu/gpu_wsl.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ const char* ffGPUDetectByDirectX(FF_MAYBE_UNUSED const FFGPUOptions* options, FF
7373
gpu->coreCount = FF_GPU_CORE_COUNT_UNSET;
7474
gpu->temperature = FF_GPU_TEMP_UNSET;
7575
gpu->frequency = FF_GPU_FREQUENCY_UNSET;
76+
gpu->coreUtilizationRate = FF_GPU_CORE_UTILIZATION_RATE_UNSET;
7677
ffStrbufInitStatic(&gpu->platformApi, "DXCore");
7778

7879
ffStrbufInit(&gpu->driver);
@@ -115,13 +116,14 @@ const char* ffGPUDetectByDirectX(FF_MAYBE_UNUSED const FFGPUOptions* options, FF
115116
.revId = hardwareId.revision,
116117
},
117118
};
118-
ffDetectNvidiaGpuInfo(&cond, (FFGpuDriverResult) {
119-
.temp = options->temp ? &gpu->temperature : NULL,
120-
.memory = options->driverSpecific ? &gpu->dedicated : NULL,
121-
.coreCount = options->driverSpecific ? (uint32_t*) &gpu->coreCount : NULL,
122-
.type = &gpu->type,
123-
.frequency = options->driverSpecific ? &gpu->frequency : NULL,
124-
}, "/usr/lib/wsl/lib/libnvidia-ml.so");
119+
ffDetectNvidiaGpuInfo(&cond, (FFGpuDriverResult){
120+
.temp = options->temp ? &gpu->temperature : NULL,
121+
.memory = options->driverSpecific ? &gpu->dedicated : NULL,
122+
.coreCount = options->driverSpecific ? (uint32_t *)&gpu->coreCount : NULL,
123+
.type = &gpu->type,
124+
.frequency = options->driverSpecific ? &gpu->frequency : NULL,
125+
.coreUtilizationRate = &gpu->coreUtilizationRate,
126+
}, "/usr/lib/wsl/lib/libnvidia-ml.so");
125127
}
126128
}
127129
}

src/detection/gpu/nvml.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
// https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceStructs.html
88
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32
99
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16
10+
#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96
1011

1112
typedef enum { NVML_SUCCESS = 0 } nvmlReturn_t;
1213
typedef struct nvmlDevice_t* nvmlDevice_t;
@@ -55,6 +56,19 @@ typedef struct {
5556
// https://github.com/NVIDIA/nvidia-settings/issues/78#issuecomment-1012837988
5657
enum { nvmlMemory_v2 = (unsigned int)(sizeof(nvmlMemory_v2_t) | (2 << 24U)) };
5758

59+
60+
// https://docs.nvidia.com/deploy/nvml-api/structnvmlMemory__t.html#structnvmlMemory__t
61+
// Memory allocation information for a device (v1)
62+
typedef struct
63+
{
64+
// Total physical device memory (in bytes)
65+
unsigned long long total;
66+
// Unallocated device memory (in bytes)
67+
unsigned long long free;
68+
// Sum of Reserved and Allocated device memory (in bytes)
69+
unsigned long long used;
70+
} nvmlMemory_t;
71+
5872
// https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g805c0647be9996589fc5e3f6ff680c64
5973
// Clock types
6074
typedef enum {
@@ -94,6 +108,16 @@ typedef enum {
94108
NVML_BRAND_COUNT,
95109
} nvmlBrandType_t;
96110

111+
// https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t
112+
// Utilization information for a device.
113+
typedef struct
114+
{
115+
// Percent of time over the past second during which one or more kernels was executing on the GPU
116+
unsigned int gpu;
117+
// Percent of time over the past second during which global (device) memory was being read or written
118+
unsigned int memory;
119+
} nvmlUtilization_t;
120+
97121
// https://docs.nvidia.com/deploy/nvml-api/group__nvmlInitializationAndCleanup.html#group__nvmlInitializationAndCleanup
98122
// Initialize NVML, but don't initialize any GPUs yet
99123
nvmlReturn_t nvmlInit_v2(void);
@@ -113,9 +137,15 @@ extern nvmlReturn_t nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t*
113137
extern nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int* temp);
114138
// Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. The reserved amount is supported on version 2 only
115139
extern nvmlReturn_t nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t* memory);
140+
// Retrieves the amount of used, free, total memory available on the device, in bytes.
141+
extern nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
116142
// Gets the device's core count
117143
extern nvmlReturn_t nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int* numCores);
118144
// Retrieves the maximum clock speeds for the device
119145
extern nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int* clock);
120146
// Retrieves the brand of this device
121147
extern nvmlReturn_t nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type);
148+
// Retrieves the current utilization rates for the device
149+
extern nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
150+
// Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, that augments the immutable, board serial identifier.
151+
extern nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);

src/detection/vulkan/vulkan.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ static const char* detectVulkan(FFVulkanResult* result)
237237
gpu->coreCount = FF_GPU_CORE_COUNT_UNSET;
238238
gpu->temperature = FF_GPU_TEMP_UNSET;
239239
gpu->frequency = FF_GPU_FREQUENCY_UNSET;
240+
gpu->coreUtilizationRate = FF_GPU_CORE_UTILIZATION_RATE_UNSET;
240241

241242
next:
242243
continue;

0 commit comments

Comments
 (0)