Skip to content

Commit 0871d70

Browse files
Cleanup: improve comments, reduce test duplication, fix copyright year
Signed-off-by: Krystian Bednarczuk <krystian@cast.ai>
1 parent b0872f9 commit 0871d70

File tree

10 files changed

+80
-71
lines changed

10 files changed

+80
-71
lines changed

internal/pkg/nvmlprovider/provider.go

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -217,9 +217,8 @@ func (n nvmlProvider) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]ui
217217
return result, nil
218218
}
219219

220-
// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU.
221-
// Returns a map from GPU Instance ID to (PID -> memory used in bytes).
222-
// Note: Only memory info is available for MIG devices, not SM utilization.
220+
// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
221+
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
223222
func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) {
224223
if err := n.preCheck(); err != nil {
225224
return nil, fmt.Errorf("failed to get MIG device process memory: %w", err)

internal/pkg/nvmlprovider/provider_test.go

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,30 @@ func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) {
3030
assert.Error(t, err, "uuid: %v, Device Info: %+v", validMIGUUID, deviceInfo)
3131
}
3232

33+
func TestGetDeviceProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
34+
provider := nvmlProvider{}
35+
result, err := provider.GetDeviceProcessMemory("GPU-test-uuid")
36+
assert.Error(t, err)
37+
assert.Nil(t, result)
38+
assert.Contains(t, err.Error(), "failed to get device process memory")
39+
}
40+
41+
func TestGetDeviceProcessUtilization_When_NVML_Not_Initialized(t *testing.T) {
42+
provider := nvmlProvider{}
43+
result, err := provider.GetDeviceProcessUtilization("GPU-test-uuid")
44+
assert.Error(t, err)
45+
assert.Nil(t, result)
46+
assert.Contains(t, err.Error(), "failed to get device process utilization")
47+
}
48+
49+
func TestGetAllMIGDevicesProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
50+
provider := nvmlProvider{}
51+
result, err := provider.GetAllMIGDevicesProcessMemory("GPU-test-uuid")
52+
assert.Error(t, err)
53+
assert.Nil(t, result)
54+
assert.Contains(t, err.Error(), "failed to get MIG device process memory")
55+
}
56+
3357
func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) {
3458
Initialize()
3559
assert.NotNil(t, Client(), "expected NVML Client to be not nil")

internal/pkg/nvmlprovider/types.go

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818

1919
package nvmlprovider
2020

21-
// NVML interface provides access to NVIDIA Management Library functionality
2221
type NVML interface {
2322
GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error)
2423
// GetDeviceProcessMemory returns memory usage for processes running on the GPU.
@@ -27,9 +26,8 @@ type NVML interface {
2726
// GetDeviceProcessUtilization returns SM utilization for processes running on the GPU.
2827
// Returns a map from PID to SM utilization percentage.
2928
GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error)
30-
// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU.
31-
// Returns a map from GPU Instance ID to (PID -> memory used in bytes).
32-
// Note: Only memory info is available for MIG devices, not SM utilization.
29+
// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
30+
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
3331
GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error)
3432
Cleanup()
3533
}

internal/pkg/transformation/const.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
package transformation
1818

1919
const (
20+
// Note: standard resource attributes
2021
podAttribute = "pod"
2122
namespaceAttribute = "namespace"
2223
containerAttribute = "container"
@@ -39,7 +40,6 @@ const (
3940

4041
DRAGPUDriverName = "gpu.nvidia.com"
4142

42-
metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL"
43-
metricFBUsed = "DCGM_FI_DEV_FB_USED"
44-
metricGREngineActive = "DCGM_FI_PROF_GR_ENGINE_ACTIVE"
43+
metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL"
44+
metricFBUsed = "DCGM_FI_DEV_FB_USED"
4545
)

internal/pkg/transformation/kubernetes_test.go

Lines changed: 21 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -347,6 +347,9 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
347347
ctrl := gomock.NewController(t)
348348
mockNVMLProvider := mocknvmlprovider.NewMockNVML(ctrl)
349349
mockNVMLProvider.EXPECT().GetMIGDeviceInfoByID(gomock.Any()).Return(migDeviceInfo, nil).AnyTimes()
350+
mockNVMLProvider.EXPECT().GetDeviceProcessMemory(gomock.Any()).Return(map[uint32]uint64{}, nil).AnyTimes()
351+
mockNVMLProvider.EXPECT().GetDeviceProcessUtilization(gomock.Any()).Return(map[uint32]uint32{}, nil).AnyTimes()
352+
mockNVMLProvider.EXPECT().GetAllMIGDevicesProcessMemory(gomock.Any()).Return(map[uint]map[uint32]uint64{}, nil).AnyTimes()
350353
nvmlprovider.SetClient(mockNVMLProvider)
351354

352355
podMapper := NewPodMapper(&appconfig.Config{
@@ -946,6 +949,7 @@ func TestProcessPodMapper_WithLabelsAndUID(t *testing.T) {
946949
}
947950

948951
func TestBuildPodValueMap(t *testing.T) {
952+
t.Parallel()
949953
tests := []struct {
950954
name string
951955
pidToPod map[uint32]*PodInfo
@@ -960,6 +964,15 @@ func TestBuildPodValueMap(t *testing.T) {
960964
fieldName: metricGPUUtil,
961965
expected: map[string]string{},
962966
},
967+
{
968+
name: "empty pidToPod returns empty map",
969+
pidToPod: map[uint32]*PodInfo{},
970+
data: &perProcessMetrics{
971+
pidToSMUtil: map[uint32]uint32{1001: 50},
972+
},
973+
fieldName: metricGPUUtil,
974+
expected: map[string]string{},
975+
},
963976
{
964977
name: "maps PID values to pod UIDs for GPU util",
965978
pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid2"}},
@@ -991,13 +1004,15 @@ func TestBuildPodValueMap(t *testing.T) {
9911004

9921005
for _, tc := range tests {
9931006
t.Run(tc.name, func(t *testing.T) {
1007+
t.Parallel()
9941008
result := buildPodValueMap(tc.pidToPod, tc.data, tc.fieldName)
9951009
assert.Equal(t, tc.expected, result)
9961010
})
9971011
}
9981012
}
9991013

10001014
func TestBuildIdlePodValues(t *testing.T) {
1015+
t.Parallel()
10011016
tests := []struct {
10021017
name string
10031018
existingValues map[string]string
@@ -1026,13 +1041,15 @@ func TestBuildIdlePodValues(t *testing.T) {
10261041

10271042
for _, tc := range tests {
10281043
t.Run(tc.name, func(t *testing.T) {
1044+
t.Parallel()
10291045
result := buildIdlePodValues(tc.existingValues, tc.devicePods)
10301046
assert.Equal(t, tc.expected, result)
10311047
})
10321048
}
10331049
}
10341050

10351051
func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
1052+
t.Parallel()
10361053
gpuUUID := "GPU-00000000-0000-0000-0000-000000000000"
10371054
podUID := "a9c80282-3f6b-4d5b-84d5-a137a6668011"
10381055

@@ -1226,6 +1243,7 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
12261243

12271244
for _, tc := range tests {
12281245
t.Run(tc.name, func(t *testing.T) {
1246+
t.Parallel()
12291247
podMapper := &PodMapper{
12301248
Config: &appconfig.Config{
12311249
UseOldNamespace: tc.useOldNS,
@@ -1245,21 +1263,17 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) {
12451263
}
12461264

12471265
func TestStripVGPUSuffix(t *testing.T) {
1266+
t.Parallel()
12481267
tests := []struct {
12491268
name string
12501269
deviceID string
12511270
expected string
12521271
}{
12531272
{
1254-
name: "AWS MIG device ID with vgpu suffix",
1273+
name: "MIG device ID with vgpu suffix",
12551274
deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::7",
12561275
expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55",
12571276
},
1258-
{
1259-
name: "AWS MIG device ID with different vgpu index",
1260-
deviceID: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18::9",
1261-
expected: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18",
1262-
},
12631277
{
12641278
name: "Plain MIG UUID without suffix",
12651279
deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55",
@@ -1289,6 +1303,7 @@ func TestStripVGPUSuffix(t *testing.T) {
12891303

12901304
for _, tc := range tests {
12911305
t.Run(tc.name, func(t *testing.T) {
1306+
t.Parallel()
12921307
result := stripVGPUSuffix(tc.deviceID)
12931308
assert.Equal(t, tc.expected, result)
12941309
})

internal/pkg/transformation/pidmapper.go

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,5 @@
1-
//go:build linux
2-
31
/*
4-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
53
*
64
* Licensed under the Apache License, Version 2.0 (the "License");
75
* you may not use this file except in compliance with the License.

internal/pkg/transformation/pidmapper_stub.go

Lines changed: 0 additions & 35 deletions
This file was deleted.

internal/pkg/transformation/pidmapper_test.go

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,5 @@
1-
//go:build linux
2-
31
/*
4-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
53
*
64
* Licensed under the Apache License, Version 2.0 (the "License");
75
* you may not use this file except in compliance with the License.
@@ -25,6 +23,7 @@ import (
2523
)
2624

2725
func TestExtractPodUID(t *testing.T) {
26+
t.Parallel()
2827
tests := []struct {
2928
name string
3029
path string
@@ -64,13 +63,15 @@ func TestExtractPodUID(t *testing.T) {
6463

6564
for _, tc := range tests {
6665
t.Run(tc.name, func(t *testing.T) {
66+
t.Parallel()
6767
result := extractPodUID(tc.path)
6868
assert.Equal(t, tc.expected, result)
6969
})
7070
}
7171
}
7272

7373
func TestExtractPodUIDFromPaths(t *testing.T) {
74+
t.Parallel()
7475
tests := []struct {
7576
name string
7677
subsystems map[string]string
@@ -104,13 +105,15 @@ func TestExtractPodUIDFromPaths(t *testing.T) {
104105

105106
for _, tc := range tests {
106107
t.Run(tc.name, func(t *testing.T) {
108+
t.Parallel()
107109
result := extractPodUIDFromPaths(tc.subsystems, tc.unified)
108110
assert.Equal(t, tc.expected, result)
109111
})
110112
}
111113
}
112114

113115
func TestBuildPIDToPodMap(t *testing.T) {
116+
t.Parallel()
114117
mapper := newPIDToPodMapper()
115118

116119
pods := []PodInfo{

internal/pkg/transformation/process_metrics.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import (
2828
)
2929

3030
func isPerProcessMetric(fieldName string) bool {
31-
return fieldName == metricGPUUtil || fieldName == metricFBUsed || fieldName == metricGREngineActive
31+
return fieldName == metricGPUUtil || fieldName == metricFBUsed
3232
}
3333

3434
// getGPUUUIDToDeviceID builds a mapping from GPU UUID to device ID based on the specified ID type.

0 commit comments

Comments
 (0)