Skip to content

Commit 6807a5c

Browse files
authored
add e2e for nvidia grid license check (#7328)
1 parent 2325fd6 commit 6807a5c

File tree

2 files changed

+52
-6
lines changed

2 files changed

+52
-6
lines changed

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,14 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
115115
Cluster: ClusterKubenet,
116116
VHD: config.VHDUbuntu2404Gen2Containerd,
117117
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
118-
nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
118+
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
119119
nbc.ConfigGPUDriverIfNeeded = true
120120
nbc.EnableGPUDevicePluginIfNeeded = true
121121
nbc.EnableNvidia = true
122122
nbc.ManagedGPUExperienceAFECEnabled = true
123123
},
124124
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
125-
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
125+
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
126126
if vmss.Tags == nil {
127127
vmss.Tags = map[string]*string{}
128128
}
@@ -171,6 +171,9 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
171171
ValidateNPDUnhealthyNvidiaDCGMServices(ctx, s)
172172
ValidateNPDUnhealthyNvidiaDCGMServicesCondition(ctx, s)
173173
ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx, s)
174+
// verify nvidia grid license status checks are reporting status correctly
175+
ValidateNPDHealthyNvidiaGridLicenseStatus(ctx, s)
176+
ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx, s)
174177
},
175178
},
176179
})
@@ -186,14 +189,14 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
186189
Cluster: ClusterKubenet,
187190
VHD: config.VHDUbuntu2204Gen2Containerd,
188191
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
189-
nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
192+
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
190193
nbc.ConfigGPUDriverIfNeeded = true
191194
nbc.EnableGPUDevicePluginIfNeeded = true
192195
nbc.EnableNvidia = true
193196
nbc.ManagedGPUExperienceAFECEnabled = true
194197
},
195198
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
196-
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
199+
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
197200
if vmss.Tags == nil {
198201
vmss.Tags = map[string]*string{}
199202
}
@@ -241,6 +244,9 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
241244
ValidateNPDUnhealthyNvidiaDCGMServices(ctx, s)
242245
ValidateNPDUnhealthyNvidiaDCGMServicesCondition(ctx, s)
243246
ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx, s)
247+
// verify nvidia grid license status checks are reporting status correctly
248+
ValidateNPDHealthyNvidiaGridLicenseStatus(ctx, s)
249+
ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx, s)
244250
},
245251
},
246252
})
@@ -256,14 +262,14 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
256262
Cluster: ClusterKubenet,
257263
VHD: config.VHDAzureLinuxV3Gen2,
258264
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
259-
nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
265+
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
260266
nbc.ConfigGPUDriverIfNeeded = true
261267
nbc.EnableGPUDevicePluginIfNeeded = true
262268
nbc.EnableNvidia = true
263269
nbc.ManagedGPUExperienceAFECEnabled = true
264270
},
265271
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
266-
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
272+
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
267273
if vmss.Tags == nil {
268274
vmss.Tags = map[string]*string{}
269275
}
@@ -311,6 +317,9 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
311317
ValidateNPDUnhealthyNvidiaDCGMServices(ctx, s)
312318
ValidateNPDUnhealthyNvidiaDCGMServicesCondition(ctx, s)
313319
ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx, s)
320+
// verify nvidia grid license status checks are reporting status correctly
321+
ValidateNPDHealthyNvidiaGridLicenseStatus(ctx, s)
322+
ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx, s)
314323
},
315324
},
316325
})
@@ -389,6 +398,9 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
389398
ValidateNPDUnhealthyNvidiaDCGMServices(ctx, s)
390399
ValidateNPDUnhealthyNvidiaDCGMServicesCondition(ctx, s)
391400
ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx, s)
401+
// verify nvidia grid license status checks are reporting status correctly
402+
ValidateNPDHealthyNvidiaGridLicenseStatus(ctx, s)
403+
ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx, s)
392404
},
393405
},
394406
})

e2e/validators.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,40 @@ func ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx context.Context, s *
695695
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "failed to restart Nvidia DCGM services")
696696
}
697697

698+
// ValidateNPDHealthyNvidiaGridLicenseStatus checks the healthy path of the
// node-problem-detector (NPD) Nvidia GRID license status check.
//
// It first runs a script on the scenario VM that asserts the GRID status
// custom-plugin-monitor config file is present (the script must exit 0), and
// then calls validateNPDCondition with "NVIDIAGRIDStatusInvalid",
// "NVIDIAGRIDStatusValid", corev1.ConditionFalse and the message
// "NVIDIA Grid Status Valid".
func ValidateNPDHealthyNvidiaGridLicenseStatus(ctx context.Context, s *Scenario) {
699+
s.T.Helper()
700+
command := []string{
701+
"set -ex",
702+
// Check that the NPD Nvidia GRID license status check config exists
703+
"test -f /etc/node-problem-detector.d/custom-plugin-monitor/gpu_checks/custom-plugin-nvidia-grid-status.json",
704+
}
705+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NPD Nvidia Grid License check configuration does not exist")
706+
// Validate that NPD is reporting healthy Nvidia GRID license status.
// The two string arguments after s are presumably the condition type and
// reason — confirm against validateNPDCondition.
707+
validateNPDCondition(ctx, s, "NVIDIAGRIDStatusInvalid", "NVIDIAGRIDStatusValid", corev1.ConditionFalse,
708+
"NVIDIA Grid Status Valid", "expected NVIDIAGRIDStatusValid message to indicate healthy status")
709+
}
710+
711+
// ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure checks the failure
// path of the node-problem-detector (NPD) Nvidia GRID license status check.
//
// It runs three steps in order: stop nvidia-gridd.service on the scenario VM
// (the script must exit 0), call validateNPDCondition expecting
// corev1.ConditionTrue with the message "nvidia-gridd is not active", and
// finally issue a restart of nvidia-gridd.service.
func ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx context.Context, s *Scenario) {
712+
s.T.Helper()
713+
// Stop nvidia-gridd systemd service to simulate failure
714+
command := []string{
715+
"set -ex",
716+
"sudo systemctl stop nvidia-gridd.service",
717+
}
718+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "failed to stop Nvidia GRID service")
719+
720+
// Validate that NPD reports an unhealthy Nvidia GRID license status.
// NOTE(review): the condition strings here are spelled with spaces
// ("NVIDIA GRID Status Invalid" / "NVIDIA GRID Status Valid"), whereas
// ValidateNPDHealthyNvidiaGridLicenseStatus passes "NVIDIAGRIDStatusInvalid" /
// "NVIDIAGRIDStatusValid". Confirm which spelling NPD actually reports and
// whether a "Valid" reason is intended together with ConditionTrue.
721+
validateNPDCondition(ctx, s, "NVIDIA GRID Status Invalid", "NVIDIA GRID Status Valid", corev1.ConditionTrue,
722+
"nvidia-gridd is not active", "expected UnhealthyNVIDIA GRID Status message to indicate unhealthy status")
723+
724+
// Restart Nvidia Grid services.
// The "|| true" makes the restart best-effort: the script exits 0 even when
// systemctl restart fails, so the exit-code check below does not detect a
// failed restart.
725+
command = []string{
726+
"set -ex",
727+
"sudo systemctl restart nvidia-gridd.service || true",
728+
}
729+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "failed to restart Nvidia GRID services")
730+
}
731+
698732
func ValidateRuncVersion(ctx context.Context, s *Scenario, versions []string) {
699733
s.T.Helper()
700734
require.Lenf(s.T, versions, 1, "Expected exactly one version for moby-runc but got %d", len(versions))

0 commit comments

Comments
 (0)