Skip to content

Commit e1102f5

Browse files
Cleanup: improve comments, reduce test duplication, fix copyright year
Signed-off-by: Krystian Bednarczuk <krystian@cast.ai>
1 parent b0872f9 commit e1102f5

File tree

11 files changed

+80
-71
lines changed

11 files changed

+80
-71
lines changed

internal/pkg/nvmlprovider/provider.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,8 @@ func (n nvmlProvider) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]ui
217217
return result, nil
218218
}
219219

220-
// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU.
221-
// Returns a map from GPU Instance ID to (PID -> memory used in bytes).
222-
// Note: Only memory info is available for MIG devices, not SM utilization.
220+
// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
221+
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
223222
func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) {
224223
if err := n.preCheck(); err != nil {
225224
return nil, fmt.Errorf("failed to get MIG device process memory: %w", err)

internal/pkg/nvmlprovider/provider_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,30 @@ func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) {
3030
assert.Error(t, err, "uuid: %v, Device Info: %+v", validMIGUUID, deviceInfo)
3131
}
3232

33+
func TestGetDeviceProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
34+
provider := nvmlProvider{}
35+
result, err := provider.GetDeviceProcessMemory("GPU-test-uuid")
36+
assert.Error(t, err)
37+
assert.Nil(t, result)
38+
assert.Contains(t, err.Error(), "failed to get device process memory")
39+
}
40+
41+
func TestGetDeviceProcessUtilization_When_NVML_Not_Initialized(t *testing.T) {
42+
provider := nvmlProvider{}
43+
result, err := provider.GetDeviceProcessUtilization("GPU-test-uuid")
44+
assert.Error(t, err)
45+
assert.Nil(t, result)
46+
assert.Contains(t, err.Error(), "failed to get device process utilization")
47+
}
48+
49+
func TestGetAllMIGDevicesProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
50+
provider := nvmlProvider{}
51+
result, err := provider.GetAllMIGDevicesProcessMemory("GPU-test-uuid")
52+
assert.Error(t, err)
53+
assert.Nil(t, result)
54+
assert.Contains(t, err.Error(), "failed to get MIG device process memory")
55+
}
56+
3357
func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) {
3458
Initialize()
3559
assert.NotNil(t, Client(), "expected NVML Client to be not nil")

internal/pkg/nvmlprovider/types.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
package nvmlprovider
2020

21-
// NVML interface provides access to NVIDIA Management Library functionality
2221
type NVML interface {
2322
GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error)
2423
// GetDeviceProcessMemory returns memory usage for processes running on the GPU.
@@ -27,9 +26,8 @@ type NVML interface {
2726
// GetDeviceProcessUtilization returns SM utilization for processes running on the GPU.
2827
// Returns a map from PID to SM utilization percentage.
2928
GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error)
30-
// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU.
31-
// Returns a map from GPU Instance ID to (PID -> memory used in bytes).
32-
// Note: Only memory info is available for MIG devices, not SM utilization.
29+
// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
30+
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
3331
GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error)
3432
Cleanup()
3533
}

internal/pkg/transformation/const.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package transformation
1818

1919
const (
20+
// Note: standard resource attributes
2021
podAttribute = "pod"
2122
namespaceAttribute = "namespace"
2223
containerAttribute = "container"
@@ -39,7 +40,6 @@ const (
3940

4041
DRAGPUDriverName = "gpu.nvidia.com"
4142

42-
metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL"
43-
metricFBUsed = "DCGM_FI_DEV_FB_USED"
44-
metricGREngineActive = "DCGM_FI_PROF_GR_ENGINE_ACTIVE"
43+
metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL"
44+
metricFBUsed = "DCGM_FI_DEV_FB_USED"
4545
)

internal/pkg/transformation/kubernetes.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,9 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic
356356
// with the container info and the shared GPU label, if it exists.
357357
// Notably, this will increase the number of unique metrics (i.e. labelsets)
358358
// by the number of containers sharing the GPU.
359+
if len(podInfos) > 0 {
360+
newmetrics = append(newmetrics, metrics[counter][j]) // original device-level metric
361+
}
359362
for _, pi := range podInfos {
360363
metric, err := utils.DeepCopy(metrics[counter][j])
361364
if err != nil {

internal/pkg/transformation/kubernetes_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,7 @@ func TestProcessPodMapper_WithLabelsAndUID(t *testing.T) {
946946
}
947947

948948
func TestBuildPodValueMap(t *testing.T) {
949+
t.Parallel()
949950
tests := []struct {
950951
name string
951952
pidToPod map[uint32]*PodInfo
@@ -960,6 +961,15 @@ func TestBuildPodValueMap(t *testing.T) {
960961
fieldName: metricGPUUtil,
961962
expected: map[string]string{},
962963
},
964+
{
965+
name: "empty pidToPod returns empty map",
966+
pidToPod: map[uint32]*PodInfo{},
967+
data: &perProcessMetrics{
968+
pidToSMUtil: map[uint32]uint32{1001: 50},
969+
},
970+
fieldName: metricGPUUtil,
971+
expected: map[string]string{},
972+
},
963973
{
964974
name: "maps PID values to pod UIDs for GPU util",
965975
pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid2"}},
@@ -991,13 +1001,15 @@ func TestBuildPodValueMap(t *testing.T) {
9911001

9921002
for _, tc := range tests {
9931003
t.Run(tc.name, func(t *testing.T) {
1004+
t.Parallel()
9941005
result := buildPodValueMap(tc.pidToPod, tc.data, tc.fieldName)
9951006
assert.Equal(t, tc.expected, result)
9961007
})
9971008
}
9981009
}
9991010

10001011
func TestBuildIdlePodValues(t *testing.T) {
1012+
t.Parallel()
10011013
tests := []struct {
10021014
name string
10031015
existingValues map[string]string
@@ -1026,13 +1038,15 @@ func TestBuildIdlePodValues(t *testing.T) {
10261038

10271039
for _, tc := range tests {
10281040
t.Run(tc.name, func(t *testing.T) {
1041+
t.Parallel()
10291042
result := buildIdlePodValues(tc.existingValues, tc.devicePods)
10301043
assert.Equal(t, tc.expected, result)
10311044
})
10321045
}
10331046
}
10341047

10351048
func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
1049+
t.Parallel()
10361050
gpuUUID := "GPU-00000000-0000-0000-0000-000000000000"
10371051
podUID := "a9c80282-3f6b-4d5b-84d5-a137a6668011"
10381052

@@ -1226,6 +1240,7 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
12261240

12271241
for _, tc := range tests {
12281242
t.Run(tc.name, func(t *testing.T) {
1243+
t.Parallel()
12291244
podMapper := &PodMapper{
12301245
Config: &appconfig.Config{
12311246
UseOldNamespace: tc.useOldNS,
@@ -1245,21 +1260,17 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
12451260
}
12461261

12471262
func TestStripVGPUSuffix(t *testing.T) {
1263+
t.Parallel()
12481264
tests := []struct {
12491265
name string
12501266
deviceID string
12511267
expected string
12521268
}{
12531269
{
1254-
name: "AWS MIG device ID with vgpu suffix",
1270+
name: "MIG device ID with vgpu suffix",
12551271
deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::7",
12561272
expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55",
12571273
},
1258-
{
1259-
name: "AWS MIG device ID with different vgpu index",
1260-
deviceID: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18::9",
1261-
expected: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18",
1262-
},
12631274
{
12641275
name: "Plain MIG UUID without suffix",
12651276
deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55",
@@ -1289,6 +1300,7 @@ func TestStripVGPUSuffix(t *testing.T) {
12891300

12901301
for _, tc := range tests {
12911302
t.Run(tc.name, func(t *testing.T) {
1303+
t.Parallel()
12921304
result := stripVGPUSuffix(tc.deviceID)
12931305
assert.Equal(t, tc.expected, result)
12941306
})

internal/pkg/transformation/pidmapper.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
//go:build linux
2-
31
/*
4-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
53
*
64
* Licensed under the Apache License, Version 2.0 (the "License");
75
* you may not use this file except in compliance with the License.

internal/pkg/transformation/pidmapper_stub.go

Lines changed: 0 additions & 35 deletions
This file was deleted.

internal/pkg/transformation/pidmapper_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
//go:build linux
2-
31
/*
4-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
53
*
64
* Licensed under the Apache License, Version 2.0 (the "License");
75
* you may not use this file except in compliance with the License.
@@ -25,6 +23,7 @@ import (
2523
)
2624

2725
func TestExtractPodUID(t *testing.T) {
26+
t.Parallel()
2827
tests := []struct {
2928
name string
3029
path string
@@ -64,13 +63,15 @@ func TestExtractPodUID(t *testing.T) {
6463

6564
for _, tc := range tests {
6665
t.Run(tc.name, func(t *testing.T) {
66+
t.Parallel()
6767
result := extractPodUID(tc.path)
6868
assert.Equal(t, tc.expected, result)
6969
})
7070
}
7171
}
7272

7373
func TestExtractPodUIDFromPaths(t *testing.T) {
74+
t.Parallel()
7475
tests := []struct {
7576
name string
7677
subsystems map[string]string
@@ -104,13 +105,15 @@ func TestExtractPodUIDFromPaths(t *testing.T) {
104105

105106
for _, tc := range tests {
106107
t.Run(tc.name, func(t *testing.T) {
108+
t.Parallel()
107109
result := extractPodUIDFromPaths(tc.subsystems, tc.unified)
108110
assert.Equal(t, tc.expected, result)
109111
})
110112
}
111113
}
112114

113115
func TestBuildPIDToPodMap(t *testing.T) {
116+
t.Parallel()
114117
mapper := newPIDToPodMapper()
115118

116119
pods := []PodInfo{

internal/pkg/transformation/process_metrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import (
2828
)
2929

3030
func isPerProcessMetric(fieldName string) bool {
31-
return fieldName == metricGPUUtil || fieldName == metricFBUsed || fieldName == metricGREngineActive
31+
return fieldName == metricGPUUtil || fieldName == metricFBUsed
3232
}
3333

3434
// getGPUUUIDToDeviceID builds a mapping from GPU UUID to device ID based on the specified ID type.

0 commit comments

Comments
 (0)