Skip to content

Commit 9ad3cd0

Browse files
committed
fix: GPU ID collision between Intel and NVIDIA collectors (#1522)
- Prefix the Intel GPU ID as `i0` to avoid colliding with NVML/NVIDIA index IDs such as `0`
- Update the frontend GPU engines chart to select a GPU by ID instead of assuming `g[0]`
- Adjust tests to use the new Intel GPU ID
1 parent 00def27 commit 9ad3cd0

File tree

4 files changed

+38
-30
lines changed

4 files changed

+38
-30
lines changed

agent/gpu.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,11 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
237237
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
238238
usage, _ := strconv.ParseFloat(v.Usage, 64)
239239

240-
if _, ok := gm.GpuDataMap[v.ID]; !ok {
241-
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name}
240+
id := v.ID
241+
if _, ok := gm.GpuDataMap[id]; !ok {
242+
gm.GpuDataMap[id] = &system.GPUData{Name: v.Name}
242243
}
243-
gpu := gm.GpuDataMap[v.ID]
244+
gpu := gm.GpuDataMap[id]
244245
gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
245246
gpu.MemoryUsed = bytesToMegabytes(memoryUsage)
246247
gpu.MemoryTotal = bytesToMegabytes(totalMemory)

agent/gpu_intel.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,11 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
2727
defer gm.Unlock()
2828

2929
// only one gpu for now - cmd doesn't provide all by default
30-
gpuData, ok := gm.GpuDataMap["0"]
30+
id := "i0" // prefix with i to avoid conflicts with nvidia card ids
31+
gpuData, ok := gm.GpuDataMap[id]
3132
if !ok {
3233
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)}
33-
gm.GpuDataMap["0"] = gpuData
34+
gm.GpuDataMap[id] = gpuData
3435
}
3536

3637
gpuData.Power += sample.PowerGPU

agent/gpu_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,7 +1385,7 @@ func TestIntelUpdateFromStats(t *testing.T) {
13851385
ok := gm.updateIntelFromStats(&sample1)
13861386
assert.True(t, ok)
13871387

1388-
gpu := gm.GpuDataMap["0"]
1388+
gpu := gm.GpuDataMap["i0"]
13891389
require.NotNil(t, gpu)
13901390
assert.Equal(t, "GPU", gpu.Name)
13911391
assert.EqualValues(t, 10.5, gpu.Power)
@@ -1407,7 +1407,7 @@ func TestIntelUpdateFromStats(t *testing.T) {
14071407
ok = gm.updateIntelFromStats(&sample2)
14081408
assert.True(t, ok)
14091409

1410-
gpu = gm.GpuDataMap["0"]
1410+
gpu = gm.GpuDataMap["i0"]
14111411
require.NotNil(t, gpu)
14121412
assert.EqualValues(t, 10.5, gpu.Power)
14131413
assert.EqualValues(t, 30.0, gpu.Engines["Render/3D"]) // 20 + 10
@@ -1446,7 +1446,7 @@ echo "298 295 278 51 2.20 3.12 1675 942 5.75 1 2 9.50
14461446
t.Fatalf("collectIntelStats error: %v", err)
14471447
}
14481448

1449-
gpu := gm.GpuDataMap["0"]
1449+
gpu := gm.GpuDataMap["i0"]
14501450
require.NotNil(t, gpu)
14511451
// Power should be sum of samples 2-4 (first is skipped): 2.0 + 1.8 + 2.2 = 6.0
14521452
assert.EqualValues(t, 6.0, gpu.Power)

internal/site/src/components/routes/system.tsx

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -409,26 +409,18 @@ export default memo(function SystemDetail({ id }: { id: string }) {
409409
if (lastGpus) {
410410
// check if there are any GPUs at all
411411
hasGpuData = Object.keys(lastGpus).length > 0
412-
// check if there are any GPUs with engines
413-
for (let i = 0; i < systemStats.length && !hasGpuEnginesData; i++) {
412+
// check if there are any GPUs with engines or power data
413+
for (let i = 0; i < systemStats.length && (!hasGpuEnginesData || !hasGpuPowerData); i++) {
414414
const gpus = systemStats[i].stats?.g
415415
if (!gpus) continue
416416
for (const id in gpus) {
417-
if (gpus[id].e !== undefined) {
417+
if (!hasGpuEnginesData && gpus[id].e !== undefined) {
418418
hasGpuEnginesData = true
419-
break
420419
}
421-
}
422-
}
423-
// check if there are any GPUs with power data
424-
for (let i = 0; i < systemStats.length && !hasGpuPowerData; i++) {
425-
const gpus = systemStats[i].stats?.g
426-
if (!gpus) continue
427-
for (const id in gpus) {
428-
if (gpus[id].p !== undefined || gpus[id].pp !== undefined) {
420+
if (!hasGpuPowerData && (gpus[id].p !== undefined || gpus[id].pp !== undefined)) {
429421
hasGpuPowerData = true
430-
break
431422
}
423+
if (hasGpuEnginesData && hasGpuPowerData) break
432424
}
433425
}
434426
}
@@ -896,16 +888,30 @@ export default memo(function SystemDetail({ id }: { id: string }) {
896888
})
897889

898890
function GpuEnginesChart({ chartData }: { chartData: ChartData }) {
899-
const dataPoints: DataPoint[] = []
900-
const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort()
901-
for (const engine of engines) {
902-
dataPoints.push({
903-
label: engine,
904-
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0,
905-
color: `hsl(${140 + (((engines.indexOf(engine) * 360) / engines.length) % 360)}, 65%, 52%)`,
906-
opacity: 0.35,
907-
})
891+
const { gpuId, engines } = useMemo(() => {
892+
for (let i = chartData.systemStats.length - 1; i >= 0; i--) {
893+
const gpus = chartData.systemStats[i].stats?.g
894+
if (!gpus) continue
895+
for (const id in gpus) {
896+
if (gpus[id].e) {
897+
return { gpuId: id, engines: Object.keys(gpus[id].e).sort() }
898+
}
899+
}
900+
}
901+
return { gpuId: null, engines: [] }
902+
}, [chartData.systemStats])
903+
904+
if (!gpuId) {
905+
return null
908906
}
907+
908+
const dataPoints: DataPoint[] = engines.map((engine, i) => ({
909+
label: engine,
910+
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[gpuId]?.e?.[engine] ?? 0,
911+
color: `hsl(${140 + (((i * 360) / engines.length) % 360)}, 65%, 52%)`,
912+
opacity: 0.35,
913+
}))
914+
909915
return (
910916
<LineChartDefault
911917
legend={true}

0 commit comments

Comments
 (0)