Skip to content

Commit 7c308bb

Browse files
authored
feat: add node label for dcgm exporter (#7447)
1 parent 196a0c8 commit 7c308bb

File tree

66 files changed

+300
-181
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+300
-181
lines changed

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
161161
ValidateNvidiaDCGMExporterSystemDServiceRunning(ctx, s)
162162
ValidateNvidiaDCGMExporterIsScrapable(ctx, s)
163163
ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx, s, "DCGM_FI_DEV_GPU_UTIL")
164+
ValidateNodeHasLabel(ctx, s, "kubernetes.azure.com/dcgm-exporter", "enabled")
164165

165166
// Let's run the NPD validation tests to verify that the nvidia
166167
// device plugin & DCGM services are reporting status correctly
@@ -234,6 +235,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
234235
ValidateNvidiaDCGMExporterSystemDServiceRunning(ctx, s)
235236
ValidateNvidiaDCGMExporterIsScrapable(ctx, s)
236237
ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx, s, "DCGM_FI_DEV_GPU_UTIL")
238+
ValidateNodeHasLabel(ctx, s, "kubernetes.azure.com/dcgm-exporter", "enabled")
237239

238240
// Let's run the NPD validation tests to verify that the nvidia
239241
// device plugin & DCGM services are reporting status correctly
@@ -307,6 +309,7 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
307309
ValidateNvidiaDCGMExporterSystemDServiceRunning(ctx, s)
308310
ValidateNvidiaDCGMExporterIsScrapable(ctx, s)
309311
ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx, s, "DCGM_FI_DEV_GPU_UTIL")
312+
ValidateNodeHasLabel(ctx, s, "kubernetes.azure.com/dcgm-exporter", "enabled")
310313

311314
// Let's run the NPD validation tests to verify that the nvidia
312315
// device plugin & DCGM services are reporting status correctly
@@ -385,6 +388,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
385388
ValidateNvidiaDCGMExporterSystemDServiceRunning(ctx, s)
386389
ValidateNvidiaDCGMExporterIsScrapable(ctx, s)
387390
ValidateNvidiaDCGMExporterScrapeCommonMetric(ctx, s, "DCGM_FI_DEV_GPU_TEMP")
391+
ValidateNodeHasLabel(ctx, s, "kubernetes.azure.com/dcgm-exporter", "enabled")
388392

389393
// Let's run the NPD validation tests to verify that the nvidia
390394
// device plugin & DCGM services are reporting status correctly

e2e/validators.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,3 +1638,14 @@ func truncatePodName(t testing.TB, pod *corev1.Pod) {
16381638
pod.Name = strings.TrimRight(pod.Name, "-")
16391639
t.Logf("truncated pod name %q to %q", name, pod.Name)
16401640
}
1641+
1642+
// ValidateNodeHasLabel checks if the node has the expected label with the expected value
1643+
func ValidateNodeHasLabel(ctx context.Context, s *Scenario, labelKey, expectedValue string) {
1644+
s.T.Helper()
1645+
node, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, s.Runtime.VM.KubeName, metav1.GetOptions{})
1646+
require.NoError(s.T, err, "failed to get node %q", s.Runtime.VM.KubeName)
1647+
1648+
actualValue, exists := node.Labels[labelKey]
1649+
require.True(s.T, exists, "expected node %q to have label %q, but it was not found", s.Runtime.VM.KubeName, labelKey)
1650+
require.Equal(s.T, expectedValue, actualValue, "expected node %q label %q to have value %q, but got %q", s.Runtime.VM.KubeName, labelKey, expectedValue, actualValue)
1651+
}

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,19 @@ EOF
11721172
echo "Enable localdns succeeded."
11731173
}
11741174

1175+
# configureManagedGPUExperience enables or disables the managed GPU experience
# services based on ENABLE_MANAGED_GPU_EXPERIENCE.
#
# It does nothing unless GPU_NODE is "true" and skip_nvidia_driver_install is
# not "true". When ENABLE_MANAGED_GPU_EXPERIENCE is "true" it runs
# installNvidiaManagedExpPkgFromCache and startNvidiaManagedExpServices
# (exiting with ERR_NVIDIA_DCGM_INSTALL / ERR_NVIDIA_DCGM_EXPORTER_FAIL on
# failure) and then adds the kubernetes.azure.com/dcgm-exporter=enabled kubelet
# node label. When it is "false" it disables and stops the nvidia-device-plugin,
# nvidia-dcgm and nvidia-dcgm-exporter units. Any other value is a no-op.
configureManagedGPUExperience() {
    # Guard: only GPU nodes whose NVIDIA driver install is not skipped are handled.
    if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then
        return 0
    fi

    case "${ENABLE_MANAGED_GPU_EXPERIENCE}" in
        true)
            logs_to_events "AKS.CSE.installNvidiaManagedExpPkgFromCache" "installNvidiaManagedExpPkgFromCache" || exit $ERR_NVIDIA_DCGM_INSTALL
            logs_to_events "AKS.CSE.startNvidiaManagedExpServices" "startNvidiaManagedExpServices" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL

            addKubeletNodeLabel "kubernetes.azure.com/dcgm-exporter=enabled"
            ;;
        false)
            logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin"
            logs_to_events "AKS.CSE.stop.nvidia-dcgm" "systemctlDisableAndStop nvidia-dcgm"
            logs_to_events "AKS.CSE.stop.nvidia-dcgm-exporter" "systemctlDisableAndStop nvidia-dcgm-exporter"
            ;;
    esac
}
1187+
11751188
startNvidiaManagedExpServices() {
11761189
# 1. Start the nvidia-device-plugin service.
11771190
# Create systemd override directory to configure device plugin

parts/linux/cloud-init/artifacts/cse_helpers.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ ERR_CTR_OPERATION_ERROR=119 # Error executing a ctr containerd cli operation
6969
ERR_INVALID_CLI_TOOL=120 # Invalid CLI tool specified, should be one of ctr, crictl, docker
7070
ERR_KUBELET_INSTALL_FAIL=121 # Error installing kubelet
7171
ERR_KUBECTL_INSTALL_FAIL=122 # Error installing kubectl
72+
ERR_ENABLE_MANAGED_GPU_EXPERIENCE=123 # Error configuring managed GPU experience
7273

7374
# 123 is now assigned to ERR_ENABLE_MANAGED_GPU_EXPERIENCE and is no longer free for use
7475

parts/linux/cloud-init/artifacts/cse_main.sh

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -400,14 +400,8 @@ function nodePrep {
400400
if [ "$?" -ne 0 ] && [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
401401
echo "failed to determine if managed GPU experience should be enabled by nodepool tags"
402402
exit $ERR_LOOKUP_ENABLE_MANAGED_GPU_EXPERIENCE_TAG
403-
elif [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ] && [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then
404-
logs_to_events "AKS.CSE.installNvidiaManagedExpPkgFromCache" "installNvidiaManagedExpPkgFromCache" || exit $ERR_NVIDIA_DCGM_INSTALL
405-
logs_to_events "AKS.CSE.startNvidiaManagedExpServices" "startNvidiaManagedExpServices" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL
406-
elif [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ] && [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "false" ]; then
407-
logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin"
408-
logs_to_events "AKS.CSE.stop.nvidia-dcgm" "systemctlDisableAndStop nvidia-dcgm"
409-
logs_to_events "AKS.CSE.stop.nvidia-dcgm-exporter" "systemctlDisableAndStop nvidia-dcgm-exporter"
410403
fi
404+
logs_to_events "AKS.CSE.configureManagedGPUExperience" configureManagedGPUExperience || exit $ERR_ENABLE_MANAGED_GPU_EXPERIENCE
411405

412406
VALIDATION_ERR=0
413407

pkg/agent/testdata/AKSUbuntu2204+China/CustomData

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu2204+Containerd+CDI/CustomData

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu2204+Containerd+DevicePlugin/CustomData

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu2204+Containerd+MIG+ArtifactStreaming/CustomData

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu2204+Containerd+MIG/CustomData

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)