-
Notifications
You must be signed in to change notification settings - Fork 250
Expand file tree
/
Copy pathvalidators.go
More file actions
1625 lines (1415 loc) · 71 KB
/
validators.go
File metadata and controls
1625 lines (1415 loc) · 71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package e2e
import (
"bytes"
"context"
"crypto/x509"
"encoding/base64"
"encoding/json"
"encoding/pem"
"fmt"
"net"
"os"
"regexp"
"strings"
"testing"
"time"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
"github.com/blang/semver"
"github.com/samber/lo"
"github.com/tidwall/gjson"
"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/pkg/agent"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
certv1 "k8s.io/api/certificates/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
)
// ValidateTLSBootstrapping dispatches to the OS-specific TLS bootstrapping
// validation: the Windows variant when the scenario's VHD OS is
// config.OSWindows, and the Linux variant for every other OS.
func ValidateTLSBootstrapping(ctx context.Context, s *Scenario) {
	if s.VHD.OS == config.OSWindows {
		validateTLSBootstrappingWindows(ctx, s)
		return
	}
	validateTLSBootstrappingLinux(ctx, s)
}
// validateTLSBootstrappingLinux verifies the kubelet client credentials on a
// Linux node and then validates whichever bootstrapping mode the scenario
// expects: secure TLS bootstrapping with bootstrap token fallback, secure TLS
// bootstrapping, or plain bootstrap token.
func validateTLSBootstrappingLinux(ctx context.Context, s *Scenario) {
	const (
		credentialFailure = "unable to validate bootstrap credentials"
		tokenValidated    = "kubelet bootstrap token credential is valid"
		tokenFailure      = "expected to have successfully validated bootstrap token credential before kubelet startup, but did not"
	)

	// The kubeconfig and the current client certificate must be listed in
	// their directories regardless of the bootstrapping mode.
	ValidateDirectoryContent(ctx, s, "/var/lib/kubelet", []string{"kubeconfig"})
	ValidateDirectoryContent(ctx, s, "/var/lib/kubelet/pki", []string{"kubelet-client-current.pem"})

	journal := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl")
	kubeletLogs := journal.stdout

	// logsConfirm reports whether the kubelet logs contain successMarker and
	// do not contain the credential validation failure message.
	logsConfirm := func(successMarker string) bool {
		if strings.Contains(kubeletLogs, credentialFailure) {
			return false
		}
		return strings.Contains(kubeletLogs, successMarker)
	}

	switch {
	case s.SecureTLSBootstrappingEnabled() && s.Tags.BootstrapTokenFallback:
		s.T.Logf("will validate bootstrapping mode: secure TLS bootstrapping failure with bootstrap token fallback")
		ValidateSystemdUnitIsNotRunning(ctx, s, "secure-tls-bootstrap")
		require.True(s.T, logsConfirm(tokenValidated), tokenFailure)
	case s.SecureTLSBootstrappingEnabled():
		s.T.Logf("will validate bootstrapping mode: secure TLS bootstrapping")
		ValidateSystemdUnitIsRunning(ctx, s, "secure-tls-bootstrap")
		validateKubeletClientCSRCreatedBySecureTLSBootstrapping(ctx, s)
		require.True(
			s.T,
			logsConfirm("client credential already exists within kubeconfig"),
			"expected to already have a valid kubeconfig before kubelet start-up obtained through secure TLS bootstrapping, but did not",
		)
	default:
		s.T.Logf("will validate bootstrapping mode: bootstrap token")
		ValidateSystemdUnitIsNotRunning(ctx, s, "secure-tls-bootstrap")
		ValidateSystemdUnitIsNotFailed(ctx, s, "secure-tls-bootstrap")
		require.True(s.T, logsConfirm(tokenValidated), tokenFailure)
	}
}
// validateTLSBootstrappingWindows verifies the kubelet client credentials on
// a Windows node. Only the secure TLS bootstrapping mode (without bootstrap
// token fallback) performs an extra check, namely that a kubelet client CSR
// was created for the node; the other modes only log which mode applies.
func validateTLSBootstrappingWindows(ctx context.Context, s *Scenario) {
	ValidateDirectoryContent(ctx, s, `c:\k`, []string{" config "})
	ValidateDirectoryContent(ctx, s, `c:\k\pki`, []string{"kubelet-client-current.pem"})

	if !s.SecureTLSBootstrappingEnabled() {
		s.T.Logf("will validate bootstrapping mode: bootstrap token")
		// nothing to validate other than node readiness
		return
	}
	if s.Tags.BootstrapTokenFallback {
		s.T.Logf("will validate bootstrapping mode: secure TLS bootstrapping failure with bootstrap token fallback")
		// nothing to validate other than node readiness
		return
	}
	s.T.Logf("will validate bootstrapping mode: secure TLS bootstrapping")
	validateKubeletClientCSRCreatedBySecureTLSBootstrapping(ctx, s)
}
// ValidateKubeletServingCertificateRotation dispatches to the OS-specific
// kubelet serving certificate rotation (KSCR) validation: the Windows variant
// when the scenario's VHD OS is config.OSWindows, the Linux variant otherwise.
func ValidateKubeletServingCertificateRotation(ctx context.Context, s *Scenario) {
	if s.VHD.OS == config.OSWindows {
		validateKubeletServingCertificateRotationWindows(ctx, s)
		return
	}
	validateKubeletServingCertificateRotationLinux(ctx, s)
}
// validateKubeletServingCertificateRotationLinux validates the kubelet serving
// certificate rotation (KSCR) configuration on a Linux node.
//
// When the VMSS carries the "aks-disable-kubelet-serving-certificate-rotation"
// tag, the static serving certificate settings must be present and the
// rotation settings absent; without the tag the expectations are inverted.
// The settings are looked up in the kubelet config file when the scenario has
// it enabled, and in the kubelet flags file otherwise.
func validateKubeletServingCertificateRotationLinux(ctx context.Context, s *Scenario) {
	const (
		kubeletFlagsFile  = "/etc/default/kubelet"
		kubeletConfigFile = "/etc/default/kubeletconfig.json"
	)

	// expectRotation checks settings that exist only while KSCR is enabled;
	// expectStaticCert checks settings that exist only while it is disabled.
	expectRotation, expectStaticCert := ValidateFileHasContent, ValidateFileExcludesContent
	if _, disabled := s.Runtime.VM.VMSS.Tags["aks-disable-kubelet-serving-certificate-rotation"]; disabled {
		s.T.Logf("linux VMSS has KSCR disablement tag, will validate that KSCR has been disabled")
		ValidateDirectoryContent(ctx, s, "/etc/kubernetes/certs", []string{"kubeletserver.crt", "kubeletserver.key"})
		expectRotation, expectStaticCert = ValidateFileExcludesContent, ValidateFileHasContent
	} else {
		s.T.Logf("will validate linux KSCR enablement")
		ValidateDirectoryContent(ctx, s, "/var/lib/kubelet/pki", []string{"kubelet-server-current.pem"})
	}

	expectRotation(ctx, s, kubeletFlagsFile, "kubernetes.azure.com/kubelet-serving-ca=cluster")
	if s.KubeletConfigFileEnabled() {
		expectStaticCert(ctx, s, kubeletConfigFile, `"tlsCertFile": "/etc/kubernetes/certs/kubeletserver.crt"`)
		expectStaticCert(ctx, s, kubeletConfigFile, `"tlsPrivateKeyFile": "/etc/kubernetes/certs/kubeletserver.key"`)
		expectRotation(ctx, s, kubeletConfigFile, `"serverTLSBootstrap": true`)
		return
	}
	expectStaticCert(ctx, s, kubeletFlagsFile, "--tls-cert-file")
	expectStaticCert(ctx, s, kubeletFlagsFile, "--tls-private-key-file")
	expectRotation(ctx, s, kubeletFlagsFile, "--rotate-server-certificates=true")
}
// validateKubeletServingCertificateRotationWindows validates the kubelet
// serving certificate rotation (KSCR) configuration on a Windows node by
// inspecting the c:\k\pki directory and the kubelet.exe process arguments.
// The VMSS tag "aks-disable-kubelet-serving-certificate-rotation" selects
// between the "disabled" and "enabled" expectations.
func validateKubeletServingCertificateRotationWindows(ctx context.Context, s *Scenario) {
	const (
		pkiDir         = `c:\k\pki`
		kubeletProcess = "kubelet.exe"
	)
	rotationArgs := []string{"--rotate-server-certificates=true", "kubernetes.azure.com/kubelet-serving-ca=cluster"}

	_, disabled := s.Runtime.VM.VMSS.Tags["aks-disable-kubelet-serving-certificate-rotation"]
	if disabled {
		s.T.Logf("windows VMSS has KSCR disablement tag, will validate that KSCR has been disabled")
		ValidateDirectoryContent(ctx, s, pkiDir, []string{"kubelet.crt", "kubelet.key"})
		ValidateWindowsProcessDoesNotContainArgumentStrings(ctx, s, kubeletProcess, rotationArgs)
		return
	}

	s.T.Logf("will validate windows KSCR enablement")
	ValidateDirectoryContent(ctx, s, pkiDir, []string{"kubelet-server-current.pem"})
	ValidateWindowsProcessContainsArgumentStrings(ctx, s, kubeletProcess, rotationArgs)
	ValidateWindowsProcessDoesNotContainArgumentStrings(ctx, s, kubeletProcess, []string{"--tls-cert-file", "--tls-private-key-file"})
}
// validateKubeletClientCSRCreatedBySecureTLSBootstrapping requires that the
// cluster holds at least one kubelet client CSR for this node which has an
// issued certificate and was not submitted by a "system:bootstrap:" user.
func validateKubeletClientCSRCreatedBySecureTLSBootstrapping(ctx context.Context, s *Scenario) {
	selector := fmt.Sprintf("spec.signerName=%s", certv1.KubeAPIServerClientKubeletSignerName)
	csrList, err := s.Runtime.Cluster.Kube.Typed.CertificatesV1().CertificateSigningRequests().List(ctx, metav1.ListOptions{
		FieldSelector: selector,
	})
	require.NoError(s.T, err, "failed to list CSRs with field selector: %s", selector)

	found := false
	for i := range csrList.Items {
		candidate := csrList.Items[i]
		issued := len(candidate.Status.Certificate) > 0
		fromBootstrapToken := strings.HasPrefix(strings.ToLower(candidate.Spec.Username), "system:bootstrap:")
		// The CSR is only decoded once the cheaper checks pass, since
		// getNodeNameFromCSR fails the test on a request it cannot parse.
		if issued && !fromBootstrapToken && getNodeNameFromCSR(s, candidate) == s.Runtime.VM.KubeName {
			found = true
			break
		}
	}
	require.True(s.T, found, "expected node %s to have created a kubelet client CSR which was approved and issued, using secure TLS bootstrapping", s.Runtime.VM.KubeName)
}
// getNodeNameFromCSR PEM-decodes and parses the certificate request embedded
// in csr and returns its subject common name with any "system:node:" prefix
// removed. The test fails if the request cannot be decoded or parsed.
func getNodeNameFromCSR(s *Scenario, csr certv1.CertificateSigningRequest) string {
	const nodePrefix = "system:node:"

	pemBlock, _ := pem.Decode(csr.Spec.Request)
	require.NotNil(s.T, pemBlock)

	certRequest, parseErr := x509.ParseCertificateRequest(pemBlock.Bytes)
	require.NoError(s.T, parseErr)

	return strings.TrimPrefix(certRequest.Subject.CommonName, nodePrefix)
}
// ValidateSystemdWatchdogForKubernetes132Plus validates the kubelet systemd
// watchdog setup. It does nothing when the scenario reports no Kubernetes
// version or a version below 1.32.0.
func ValidateSystemdWatchdogForKubernetes132Plus(ctx context.Context, s *Scenario) {
	version := s.GetK8sVersion()
	if version == "" || !agent.IsKubernetesVersionGe(version, "1.32.0") {
		return
	}

	// Validate systemd watchdog is enabled and configured for kubelet
	const kubeletUnit = "kubelet.service"
	ValidateSystemdUnitIsRunning(ctx, s, kubeletUnit)
	ValidateFileHasContent(ctx, s, "/etc/systemd/system/kubelet.service.d/10-watchdog.conf", "WatchdogSec=60s")
	ValidateJournalctlOutput(ctx, s, kubeletUnit, "Starting systemd watchdog with interval")
}
// ValidateAKSLogCollector validates that the aks-log-collector systemd unit
// is not in a failed state.
func ValidateAKSLogCollector(ctx context.Context, s *Scenario) {
	const unit = "aks-log-collector"
	ValidateSystemdUnitIsNotFailed(ctx, s, unit)
}
// ValidateDiskQueueService validates that the disk_queue.service systemd unit
// is running.
func ValidateDiskQueueService(ctx context.Context, s *Scenario) {
	const unit = "disk_queue.service"
	ValidateSystemdUnitIsRunning(ctx, s, unit)
}
// ValidateLeakedSecrets validates that none of the scenario's secrets appear
// as exact content in the provisioning log files. The client private key and
// the service principal secret are searched for in base64-encoded form, the
// bootstrap token as-is; secrets that are empty are skipped.
func ValidateLeakedSecrets(ctx context.Context, s *Scenario) {
	encode := func(raw string) string {
		return base64.StdEncoding.EncodeToString([]byte(raw))
	}
	sensitiveValues := map[string]string{
		"client private key":       encode(s.GetClientPrivateKey()),
		"service principal secret": encode(s.GetServicePrincipalSecret()),
		"bootstrap token":          s.GetTLSBootstrapToken(),
	}
	logFiles := []string{"/var/log/azure/cluster-provision.log", "/var/log/azure/aks-node-controller.log"}

	for _, logFile := range logFiles {
		for _, value := range sensitiveValues {
			if value == "" {
				continue
			}
			ValidateFileExcludesExactContent(ctx, s, logFile, value)
		}
	}
}
// ValidateSSHServiceEnabled validates that the ssh unit is running, that
// ssh.socket is inactive, and that ssh.service is enabled.
func ValidateSSHServiceEnabled(ctx context.Context, s *Scenario) {
	// The SSH service itself must be active and running.
	ValidateSystemdUnitIsRunning(ctx, s, "ssh")

	// Socket-based activation must be disabled: the is-active query is
	// expected to exit with code 3 and to report "inactive".
	socketState := execScriptOnVMForScenarioValidateExitCode(ctx, s, "systemctl is-active ssh.socket", 3, "could not check ssh.socket status")
	require.Contains(s.T, socketState.stdout, "inactive", "ssh.socket should be inactive")

	// systemd must consider the SSH service enabled at boot.
	bootState := execScriptOnVMForScenarioValidateExitCode(ctx, s, "systemctl is-enabled ssh.service", 0, "could not check ssh.service status")
	require.Contains(s.T, bootState.stdout, "enabled", "ssh.service should be enabled at boot")
}
// ValidateDirectoryContent lists the directory at path on the VM (with
// Get-ChildItem on Windows, "sudo ls -la" otherwise) and requires every entry
// of files to appear as a substring of the listing output.
func ValidateDirectoryContent(ctx context.Context, s *Scenario, path string, files []string) {
	s.T.Helper()

	var script string
	if s.IsWindows() {
		script = "$ErrorActionPreference = \"Stop\"" + "\n" + fmt.Sprintf("Get-ChildItem -Path %s", path)
	} else {
		script = "set -ex" + "\n" + fmt.Sprintf("sudo ls -la %s", path)
	}

	listing := execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not get directory contents").stdout
	for i := range files {
		require.Contains(s.T, listing, files[i], "expected to find file %s within directory %s, but did not.\nDirectory contents:\n%s", files[i], path, listing)
	}
}
// ValidateSysctlConfig queries every key of customSysctls with "sudo sysctl"
// on the VM and requires the output to contain "<name> = <value>" for each
// entry of the map.
func ValidateSysctlConfig(ctx context.Context, s *Scenario, customSysctls map[string]string) {
	s.T.Helper()
	// Length 0 with reserved capacity: creating the slice with a non-zero
	// length and then appending would leave empty strings at the front.
	keysToCheck := make([]string, 0, len(customSysctls))
	for k := range customSysctls {
		keysToCheck = append(keysToCheck, k)
	}
	command := []string{
		"set -ex",
		// sed collapses any whitespace run between two digits into a single
		// space, so multi-valued settings can be compared against "a b c".
		fmt.Sprintf("sudo sysctl %s | sed -E 's/([0-9])\\s+([0-9])/\\1 \\2/g'", strings.Join(keysToCheck, " ")),
	}
	execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "sysctl command failed")
	for name, value := range customSysctls {
		require.Contains(s.T, execResult.stdout, fmt.Sprintf("%s = %v", name, value), "expected to find %s set to %v, but was not.\nStdout:\n%s", name, value, execResult.stdout)
	}
}
// ValidateNvidiaSMINotInstalled runs "sudo nvidia-smi" on the VM expecting
// exit code 1 and requires stderr to report that the command was not found.
func ValidateNvidiaSMINotInstalled(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const script = "set -ex\nsudo nvidia-smi"
	outcome := execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 1, "")
	require.Contains(s.T, outcome.stderr, "nvidia-smi: command not found", "expected stderr to contain 'nvidia-smi: command not found', but got %q", outcome.stderr)
}
// ValidateNvidiaSMIInstalled runs "sudo nvidia-smi" on the VM and expects it
// to exit with code 0.
func ValidateNvidiaSMIInstalled(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const script = "set -ex\nsudo nvidia-smi"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not execute nvidia-smi command")
}
// ValidateNvidiaModProbeInstalled runs "sudo nvidia-modprobe" on the VM and
// expects it to exit with code 0.
func ValidateNvidiaModProbeInstalled(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const script = "set -ex\nsudo nvidia-modprobe"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not execute nvidia-modprobe command")
}
// ValidateNvidiaGRIDLicenseValid runs a script on the VM that checks that
// "nvidia-smi -q" reports a "License Status" line containing "Licensed" and
// that the nvidia-gridd unit is active; the script must exit with code 0.
func ValidateNvidiaGRIDLicenseValid(ctx context.Context, s *Scenario) {
	s.T.Helper()
	script := strings.Join([]string{
		"set -ex",
		// Capture the license status line; "|| true" keeps the script going
		// when grep finds nothing.
		`license_status=$(sudo nvidia-smi -q | grep 'License Status' | grep 'Licensed' || true)`,
		// An empty capture means the license is missing or not valid.
		`if [ -z "$license_status" ]; then echo 'License status not valid or not found'; exit 1; fi`,
		// Capture the is-active output of nvidia-gridd and require "active".
		`active_status=$(sudo systemctl is-active nvidia-gridd)`,
		`if [ "$active_status" != "active" ]; then echo "nvidia-gridd is not active: $active_status"; exit 1; fi`,
	}, "\n")
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "failed to validate nvidia-smi license state or nvidia-gridd service status")
}
// ValidateNvidiaPersistencedRunning runs a script on the VM that requires the
// nvidia-persistenced.service unit to be active; the script must exit with
// code 0.
func ValidateNvidiaPersistencedRunning(ctx context.Context, s *Scenario) {
	s.T.Helper()
	command := []string{
		"set -ex",
		// Capture the is-active output. "|| true" stops "set -e" from
		// aborting the script on a non-active unit before the diagnostic
		// below can be printed.
		"active_status=$(sudo systemctl is-active nvidia-persistenced.service || true)",
		// Fail with a message that names the unit actually being checked.
		"if [ \"$active_status\" != \"active\" ]; then echo \"nvidia-persistenced is not active: $active_status\"; exit 1; fi",
	}
	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "failed to validate nvidia-persistenced.service status")
}
// ValidateNonEmptyDirectory lists dirName on the VM with "sudo ls -1q" and
// expects the pipeline to exit with code 0, which requires grep to match at
// least one line of the listing.
func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) {
	s.T.Helper()
	script := "set -ex\n" + fmt.Sprintf("sudo ls -1q %s | grep -q '^.*$' && true || false", dirName)
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "either could not find expected file, or something went wrong")
}
// ValidateFileExists fails the test when fileExist reports that fileName is
// not present on the VM.
func ValidateFileExists(ctx context.Context, s *Scenario, fileName string) {
	s.T.Helper()
	if !fileExist(ctx, s, fileName) {
		s.T.Fatalf("expected file %s to exist, but it does not", fileName)
	}
}
// ValidateFileDoesNotExist fails the test when fileExist reports that
// fileName is present on the VM.
func ValidateFileDoesNotExist(ctx context.Context, s *Scenario, fileName string) {
	s.T.Helper()
	if fileExist(ctx, s, fileName) {
		s.T.Fatalf("expected file %s to not exist, but it does", fileName)
	}
}
// ValidateFileIsRegularFile fails the test unless "stat --printf=%F" on the VM
// reports fileName as a "regular file".
func ValidateFileIsRegularFile(ctx context.Context, s *Scenario, fileName string) {
	s.T.Helper()
	script := "set -ex\n" + fmt.Sprintf("stat --printf=%%F %s | grep 'regular file'", fileName)
	if execScriptOnVMForScenario(ctx, s, script).exitCode == "0" {
		return
	}
	s.T.Fatalf("expected %s to be a regular file, but it is not", fileName)
}
// fileExist reports whether fileName exists on the VM, using Test-Path on
// Windows and "test -f" otherwise. On Windows the script's stdout and stderr
// are also written to the test log.
func fileExist(ctx context.Context, s *Scenario, fileName string) bool {
	s.T.Helper()
	if !s.IsWindows() {
		probe := "set -ex\n" + fmt.Sprintf("test -f %s", fileName)
		return execScriptOnVMForScenario(ctx, s, probe).exitCode == "0"
	}

	probe := "$ErrorActionPreference = \"Stop\"\n" + fmt.Sprintf("if (Test-Path -Path '%s') { exit 0 } else { exit 1 }", fileName)
	outcome := execScriptOnVMForScenario(ctx, s, probe)
	s.T.Logf("stdout: %s\nstderr: %s", outcome.stdout, outcome.stderr)
	return outcome.exitCode == "0"
}
// fileHasContent reports whether fileName on the VM contains contents as a
// literal substring (Select-String -SimpleMatch on Windows, grep -F
// otherwise). The file is also printed by the script before it is searched.
// An empty contents value fails the test as a setup error.
func fileHasContent(ctx context.Context, s *Scenario, fileName string, contents string) bool {
	s.T.Helper()
	require.NotEmpty(s.T, contents, "Test setup failure: Can't validate that a file has contents with an empty string. Filename: %s", fileName)

	var script []string
	if s.IsWindows() {
		script = []string{
			`$ErrorActionPreference = "Stop"`,
			fmt.Sprintf("Get-Content %s", fileName),
			fmt.Sprintf("if ( -not ( Test-Path -Path %s ) ) { exit 2 }", fileName),
			fmt.Sprintf(`if (Select-String -Path %s -Pattern "%s" -SimpleMatch -Quiet) { exit 0 } else { exit 1 }`, fileName, contents),
		}
	} else {
		script = []string{
			"set -ex",
			fmt.Sprintf("sudo cat %s", fileName),
			fmt.Sprintf("(sudo cat %s | grep -q -F -e %q)", fileName, contents),
		}
	}

	outcome := execScriptOnVMForScenario(ctx, s, strings.Join(script, "\n"))
	return outcome.exitCode == "0"
}
// fileHasExactContent reports whether fileName on the VM contains contents
// delimited on both sides by a non-word character or a line/text boundary.
// The pattern is shipped to the VM base64-encoded and regex-escaped there
// before matching. An empty contents value fails the test as a setup error.
func fileHasExactContent(ctx context.Context, s *Scenario, fileName string, contents string) bool {
	s.T.Helper()
	require.NotEmpty(s.T, contents, "Test setup failure: Can't validate that a file has contents with an empty string. Filename: %s", fileName)
	// Base64 keeps the pattern free of characters the remote shell would
	// otherwise interpret.
	encodedPattern := base64.StdEncoding.EncodeToString([]byte(contents))

	var script []string
	if s.IsWindows() {
		script = []string{
			"$ErrorActionPreference = \"Stop\"",
			fmt.Sprintf("if ( -not ( Test-Path -Path %s ) ) { exit 2 }", fileName),
			fmt.Sprintf("$pattern = [Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('%s'))", encodedPattern),
			fmt.Sprintf("$content = Get-Content -Path %s -Raw", fileName),
			"$escaped = [regex]::Escape($pattern)",
			// Look-arounds require that no word character touches the match.
			"if ([regex]::Match($content, \"(?<!\\w)\" + $escaped + \"(?!\\w)\").Success) { exit 0 } else { exit 1 }",
		}
	} else {
		script = []string{
			"set -ex",
			fmt.Sprintf("if [ ! -f %s ]; then exit 2; fi", fileName),
			fmt.Sprintf("pattern=$(printf '%%s' '%s' | base64 -d)", encodedPattern),
			// Backslash-escape regex metacharacters in the decoded pattern.
			"escaped=$(printf '%s\n' \"$pattern\" | sed -e 's/[.\\[\\()*?^$+{}|]/\\\\&/g')",
			// Require a non-word character or a line boundary on each side.
			"regex='(^|[^[:alnum:]_])'\"$escaped\"'([^[:alnum:]_]|$)'",
			fmt.Sprintf("if sudo grep -Eq \"$regex\" %s; then exit 0; else exit 1; fi", fileName),
		}
	}

	outcome := execScriptOnVMForScenario(ctx, s, strings.Join(script, "\n"))
	return outcome.exitCode == "0"
}
// ValidateFileHasContent fails the test unless the specified file contains
// the specified contents. Plain substring matching is used: the contents do
// not have to be delimited by non-word characters.
// E.g.: "bcd" is found in "abcdef", so the validation passes.
func ValidateFileHasContent(ctx context.Context, s *Scenario, fileName string, contents string) {
	s.T.Helper()
	if fileHasContent(ctx, s, fileName, contents) {
		return
	}
	s.T.Fatalf("expected file %s to have contents %q, but it does not", fileName, contents)
}
// ValidateFileExcludesContent fails the test when the specified file contains
// the specified contents. Plain substring matching is used: the contents do
// not have to be delimited by non-word characters.
// E.g.: "bcd" is found in "abcdef", so the validation fails.
func ValidateFileExcludesContent(ctx context.Context, s *Scenario, fileName string, contents string) {
	s.T.Helper()
	if !fileHasContent(ctx, s, fileName, contents) {
		return
	}
	s.T.Fatalf("expected file %s to not have contents %q, but it does", fileName, contents)
}
// ValidateFileExcludesExactContent fails the test when the specified file
// contains the specified contents delimited by non-word characters.
// E.g.: "bcd" inside "abcdef" does not count as a match, so the validation
// passes.
func ValidateFileExcludesExactContent(ctx context.Context, s *Scenario, fileName string, contents string) {
	s.T.Helper()
	if !fileHasExactContent(ctx, s, fileName, contents) {
		return
	}
	s.T.Fatalf("expected file %s to not have exact contents %q, but it does", fileName, contents)
}
// ServiceCanRestartValidator kills serviceName via "systemctl kill", waits
// restartTimeoutInSeconds seconds, and requires that the service is active
// again with a PID different from the one it had before the kill.
func ServiceCanRestartValidator(ctx context.Context, s *Scenario, serviceName string, restartTimeoutInSeconds int) {
	s.T.Helper()

	// Printing the status first gives the logs some context; "|| true" keeps
	// it from failing the script on its own.
	printStatus := fmt.Sprintf("(systemctl -n 5 status %s || true)", serviceName)
	requireActive := fmt.Sprintf("systemctl is-active %s", serviceName)
	// capturePID stores the pgrep result for the service in a shell variable.
	capturePID := func(variable string) string {
		return fmt.Sprintf("%s=`sudo pgrep %s`", variable, serviceName)
	}

	script := []string{
		"set -ex",
		printStatus,
		requireActive,
		capturePID("INITIAL_PID"),
		"echo INITIAL_PID: $INITIAL_PID",
		// systemctl kill is used rather than kill -9 because container
		// restrictions stop us sending a kill signal to a process directly
		fmt.Sprintf("sudo systemctl kill %s", serviceName),
		// give the service time to restart
		fmt.Sprintf("sleep %d", restartTimeoutInSeconds),
		printStatus,
		requireActive,
		capturePID("POST_PID"),
		"echo POST_PID: $POST_PID",
		// an unchanged PID means the service never restarted
		`if [[ "$INITIAL_PID" == "$POST_PID" ]]; then echo PID did not change after restart, failing validator. ; exit 1; fi`,
	}
	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(script, "\n"), 0, "command to restart service failed")
}
// ValidateSystemdUnitIsRunning prints the status of serviceName for logging
// and then expects "systemctl is-active" to exit with code 0 for it.
func ValidateSystemdUnitIsRunning(ctx context.Context, s *Scenario, serviceName string) {
	s.T.Helper()
	// Line 2 prints the status (failure tolerated); line 3 is the actual check.
	script := fmt.Sprintf("set -ex\nsystemctl -n 5 status %[1]s || true\nsystemctl is-active %[1]s", serviceName)
	failure := fmt.Sprintf("service %s is not running", serviceName)
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, failure)
}
// ValidateSystemdUnitIsNotRunning prints the status of serviceName for logging
// and then expects the negated "systemctl is-active" check to exit with code
// 0, i.e. the unit must not be active.
func ValidateSystemdUnitIsNotRunning(ctx context.Context, s *Scenario, serviceName string) {
	s.T.Helper()
	// Line 2 prints the status (failure tolerated); line 3 succeeds only when
	// is-active itself fails.
	script := fmt.Sprintf("set -ex\nsystemctl -n 5 status %[1]s || true\n! systemctl is-active %[1]s", serviceName)
	failure := fmt.Sprintf("service %s is unexpectedly running", serviceName)
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, failure)
}
// ValidateWindowsServiceIsRunning runs a PowerShell script on the VM that
// prints the service and throws unless its status is 'Running'; the script
// must exit with code 0.
func ValidateWindowsServiceIsRunning(ctx context.Context, s *Scenario, serviceName string) {
	s.T.Helper()
	getService := fmt.Sprintf("Get-Service -Name %s", serviceName)
	script := strings.Join([]string{
		`$ErrorActionPreference = "Stop"`,
		// Print the service status for logging purposes
		getService,
		// Verify the service is running
		"$service = " + getService,
		`if ($service.Status -ne 'Running') { throw "Service is not running: $($service.Status)" }`,
	}, "\n")
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0,
		fmt.Sprintf("Windows service %s is not running", serviceName))
}
// ValidateWindowsServiceIsNotRunning runs a PowerShell script on the VM that
// throws when the service exists with status 'Running', and otherwise reports
// whether the service is stopped or missing; the script must exit with code 0.
func ValidateWindowsServiceIsNotRunning(ctx context.Context, s *Scenario, serviceName string) {
	s.T.Helper()
	// A missing service must not raise, hence SilentlyContinue.
	getService := fmt.Sprintf("Get-Service -Name %s -ErrorAction SilentlyContinue", serviceName)
	script := strings.Join([]string{
		`$ErrorActionPreference = "Continue"`,
		// Print the service status for logging purposes
		getService,
		// Check if service exists and is not running
		"$service = " + getService,
		`if ($service -and $service.Status -eq 'Running') { throw "Service is unexpectedly running: $($service.Status)" }`,
		`if ($service -and $service.Status -ne 'Running') { Write-Host "Service exists but is not running: $($service.Status)" }`,
		`if (-not $service) { Write-Host "Service does not exist" }`,
	}, "\n")
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0,
		fmt.Sprintf("Windows service %s validation failed", serviceName))
}
// ValidateSystemdUnitIsNotFailed prints the status of serviceName for logging
// and requires "systemctl is-failed" to exit with a non-zero code for it.
func ValidateSystemdUnitIsNotFailed(ctx context.Context, s *Scenario, serviceName string) {
	s.T.Helper()
	// Line 2 prints the status (failure tolerated); the exit code of line 3
	// becomes the script's exit code.
	script := fmt.Sprintf("set -ex\nsystemctl --no-pager -n 5 status %[1]s || true\nsystemctl is-failed %[1]s", serviceName)
	exitCode := execScriptOnVMForScenario(ctx, s, script).exitCode
	require.NotEqual(
		s.T,
		"0",
		exitCode,
		`expected "systemctl is-failed" to exit with a non-zero exit code for unit %q, unit is in a failed state`,
		serviceName,
	)
}
// ValidateNoFailedSystemdUnits lists the failed systemd units on the VM and
// fails the test if any of them is not on the allow list. Before failing, the
// journal of every unexpectedly failed unit is collected and dumped via
// dumpFileMapToDir under the name "<unit>.log".
func ValidateNoFailedSystemdUnits(ctx context.Context, s *Scenario) {
	unitFailureAllowList := map[string]bool{
		// this service depends on non-network-isolated environment - E2Es are run in an environment
		// which simulates network-isolation by only allowing egress to recommended domains outlined
		// on public AKS documentation via a firewall. This service depends on some other domain which is
		// not currently allowed by the firewall. It also seems that this service is only installed on
		// Ubuntu - do we even need it? it seems that it's coming from the base image
		"fwupd-refresh.service": true,
	}
	if s.Tags.BootstrapTokenFallback {
		// secure-tls-bootstrap.service is expected to fail within scenarios that test bootstrap token fall-back behavior
		unitFailureAllowList["secure-tls-bootstrap.service"] = true
	}
	if s.VHD.IgnoreFailedCgroupTelemetryServices {
		unitFailureAllowList["cgroup-memory-telemetry.service"] = true
		unitFailureAllowList["cgroup-pressure-telemetry.service"] = true
	}

	// systemdUnit captures the only field needed from the JSON output of
	// "systemctl list-units".
	type systemdUnit struct {
		Name string `json:"unit,omitempty"`
	}
	var failedUnits []systemdUnit
	// The failure message is a plain constant; no formatting is needed.
	result := execScriptOnVMForScenarioValidateExitCode(ctx, s, "systemctl list-units --failed --output json", 0, "unable to list failed systemd units")
	assert.NoError(s.T, json.Unmarshal([]byte(result.stdout), &failedUnits), `unable to parse and unmarshal "systemctl list-units" command output`)

	// drop the units which are allowed to fail
	failedUnits = lo.Filter(failedUnits, func(unit systemdUnit, _ int) bool {
		return !unitFailureAllowList[unit.Name]
	})
	if len(failedUnits) == 0 {
		// no unexpectedly failed units
		return
	}

	// extract failed unit logs
	failedUnitLogs := make(map[string]string, len(failedUnits))
	for _, unit := range failedUnits {
		failedUnitLogs[unit.Name+".log"] = execScriptOnVMForScenario(ctx, s, fmt.Sprintf("journalctl -u %s", unit.Name)).String()
	}
	assert.NoError(s.T, dumpFileMapToDir(s.T, failedUnitLogs), "failed to dump failed systemd unit logs")

	s.T.Fatalf(
		"the following systemd units have unexpectedly entered a failed state: %s - failed unit logs will be included in scenario log bundle within <service-name>.service.log",
		lo.Map(failedUnits, func(unit systemdUnit, _ int) string {
			return unit.Name
		}),
	)
}
// ValidateUlimitSettings greps the containerd.service unit file on the VM for
// the keys of ulimits (case-insensitively) and requires the output to contain
// "<name>=<value>" for every entry of the map.
func ValidateUlimitSettings(ctx context.Context, s *Scenario, ulimits map[string]string) {
	s.T.Helper()

	names := make([]string, 0, len(ulimits))
	for name := range ulimits {
		names = append(names, name)
	}
	// The names are combined into one alternation for grep -E.
	script := fmt.Sprintf("sudo systemctl cat containerd.service | grep -E -i '%s'", strings.Join(names, "|"))
	unitFile := execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not read containerd.service file").stdout

	for name, value := range ulimits {
		expected := fmt.Sprintf("%s=%v", name, value)
		require.Contains(s.T, unitFile, expected, "expected to find %s set to %v, but was not", name, value)
	}
}
// ValidateInstalledPackageVersion lists the installed packages on the VM
// (apt on Ubuntu, dnf on Mariner and Azure Linux) and reports a test error
// unless some line of the listing contains both component and version. Any
// other OS fails the test because no listing command is implemented for it.
func ValidateInstalledPackageVersion(ctx context.Context, s *Scenario, component, version string) {
	s.T.Helper()

	var listCommand string
	switch s.VHD.OS {
	case config.OSUbuntu:
		listCommand = "sudo apt list --installed"
	case config.OSMariner, config.OSAzureLinux:
		listCommand = "sudo dnf list installed"
	default:
		s.T.Fatalf("command to get package list isn't implemented for OS %s", s.VHD.OS)
	}

	listing := execScriptOnVMForScenarioValidateExitCode(ctx, s, listCommand, 0, "could not get package list").stdout
	for _, entry := range strings.Split(listing, "\n") {
		if !strings.Contains(entry, component) || !strings.Contains(entry, version) {
			continue
		}
		s.T.Logf("found %s %s in the installed packages", component, version)
		return
	}
	s.T.Errorf("expected to find %s %s in the installed packages, but did not", component, version)
}
// ValidateKubeletNodeIP reads /etc/default/kubelet on the VM and requires the
// --node-ip flag to be present with one or two comma-separated values, each
// of which must parse as an IP address.
func ValidateKubeletNodeIP(ctx context.Context, s *Scenario) {
	s.T.Helper()
	execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo cat /etc/default/kubelet", 0, "could not read kubelet config")
	stdout := execResult.stdout

	// Search for "--node-ip" flag and its value. The character class includes
	// ':' so that IPv6 addresses (dual-stack) are captured in full rather than
	// being cut off at their first colon.
	matches := regexp.MustCompile(`--node-ip=([a-zA-Z0-9.:,]*)`).FindStringSubmatch(stdout)
	require.NotNil(s.T, matches, "could not find kubelet flag --node-ip\nStdout: \n%s", stdout)
	require.GreaterOrEqual(s.T, len(matches), 2, "could not find kubelet flag --node-ip.\nStdout: \n%s", stdout)

	ipAddresses := strings.Split(matches[1], ",") // Could be multiple for dual-stack.
	require.GreaterOrEqual(s.T, len(ipAddresses), 1, "expected at least one --node-ip address, but got none\nStdout: \n%s", stdout)
	require.LessOrEqual(s.T, len(ipAddresses), 2, "expected at most two --node-ip addresses, but got %d\nStdout: \n%s", len(ipAddresses), stdout)

	// Check that each IP is a valid address; an empty flag value yields a
	// single empty entry, which is rejected here as well.
	for _, ipAddress := range ipAddresses {
		require.NotNil(s.T, net.ParseIP(ipAddress), "--node-ip value %q is not a valid IP address\nStdout: \n%s", ipAddress, stdout)
	}
}
// ValidateIMDSRestrictionRule dumps the rules of the given iptables table on the VM
// and requires that the AgentBaker IMDS-restriction rule comment is present
// (the grep must exit with code 0).
func ValidateIMDSRestrictionRule(ctx context.Context, s *Scenario, table string) {
	s.T.Helper()
	const ruleComment = "AKS managed: added by AgentBaker ensureIMDSRestriction for IMDS restriction feature"
	script := "sudo iptables -t " + table + " -S | grep -q '" + ruleComment + "'"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "expected to find IMDS restriction rule, but did not")
}
// ValidateMultipleKubeProxyVersionsExist lists the kube-proxy images in the k8s.io
// containerd namespace on the VM, extracts the x.y.z version from each image
// reference, and reports a test error unless at least two distinct versions exist.
func ValidateMultipleKubeProxyVersionsExist(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const listVersionsCmd = `sudo ctr --namespace k8s.io images list | grep kube-proxy | awk '{print $1}' | grep -oE '[0-9]+\.[0-9]+\.[0-9]+'`
	result := execScriptOnVMForScenario(ctx, s, listVersionsCmd)
	if result.exitCode != "0" {
		s.T.Errorf("Failed to list kube-proxy images: %s", result.stderr)
		return
	}

	output := bytes.NewBufferString(strings.TrimSpace(result.stdout))
	// Used as a set: one key per distinct version string.
	distinct := make(map[string]struct{})
	for _, line := range strings.Split(output.String(), "\n") {
		if line == "" {
			continue
		}
		distinct[line] = struct{}{}
	}

	switch count := len(distinct); {
	case count == 0:
		s.T.Errorf("No kube-proxy versions found.")
	case count == 1:
		s.T.Errorf("Only one kube-proxy version exists: %v", distinct)
	default:
		s.T.Logf("Multiple kube-proxy versions exist: %v", distinct)
	}
}
// ValidateKubeletHasNotStopped reads the kubelet unit's journal on the VM and
// asserts, case-insensitively, that it never logged "stopped kubelet" and did log
// "started kubelet". Both assertions are non-fatal, so both are always evaluated.
func ValidateKubeletHasNotStopped(ctx context.Context, s *Scenario) {
	s.T.Helper()
	journal := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl")
	// Lower-case once so the substring checks ignore case.
	lowered := strings.ToLower(journal.stdout)
	assert.NotContains(s.T, lowered, "stopped kubelet")
	assert.Contains(s.T, lowered, "started kubelet")
}
// ValidateServicesDoNotRestartKubelet recursively greps /etc/systemd/system/ on the
// VM for "restart" followed by whitespace and "kubelet", and requires the grep to
// exit with code 1, i.e. no file matched.
func ValidateServicesDoNotRestartKubelet(ctx context.Context, s *Scenario) {
	s.T.Helper()
	// -l prints only the names of matching files; the assertion is on the exit code.
	const grepCmd = `sudo grep -rl 'restart[[:space:]]\+kubelet' /etc/systemd/system/`
	execScriptOnVMForScenarioValidateExitCode(ctx, s, grepCmd, 1, "expected to find no services containing 'restart kubelet' in /etc/systemd/system/")
}
// ValidateKubeletHasFlags reads the kubelet unit's journal on the VM and requires
// that it contains the startup log line `FLAG: --config="<filePath>"`, i.e. that
// kubelet was started with the given config file.
func ValidateKubeletHasFlags(ctx context.Context, s *Scenario, filePath string) {
	s.T.Helper()
	journal := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl")
	wantLine := "FLAG: --config=\"" + filePath + "\""
	require.Containsf(s.T, journal.stdout, wantLine, "expected to find flag %s, but not found", "config")
}
// ValidateContainerd2Properties validates a containerd 2.x install. It requires that
// versions holds exactly one moby-containerd version starting with "2.", that this
// version is among the installed packages, and that `containerd config dump` emits no
// "level=warning" lines.
func ValidateContainerd2Properties(ctx context.Context, s *Scenario, versions []string) {
	s.T.Helper()
	require.Lenf(s.T, versions, 1, "Expected exactly one version for moby-containerd but got %d", len(versions))
	// assert versions[0] value starts with '2.'
	require.Truef(s.T, strings.HasPrefix(versions[0], "2."), "expected moby-containerd version to start with '2.', got %v", versions[0])

	ValidateInstalledPackageVersion(ctx, s, "moby-containerd", versions[0])

	execResult := execOnVMForScenarioOnUnprivilegedPod(ctx, s, "containerd config dump ")
	// validate containerd config dump has no warnings; the dump itself is included in
	// the failure message (the verb was previously a bare '%', which garbled the output).
	require.NotContains(s.T, execResult.stdout, "level=warning", "do not expect warning message when converting config file %s", execResult.stdout)
}
// ValidateContainerRuntimePlugins checks for the container runtime plugin socket
// by delegating to ValidateDirectoryContent for /var/run/nri with the entry
// "nri.sock".
func ValidateContainerRuntimePlugins(ctx context.Context, s *Scenario) {
	// Marked as a helper like the other validators so failures point at the caller.
	s.T.Helper()
	// nri plugin is enabled by default
	ValidateDirectoryContent(ctx, s, "/var/run/nri", []string{"nri.sock"})
}
// ValidateNPDGPUCountPlugin requires that the NPD GPU count custom-plugin
// configuration file exists as a regular file on the VM.
func ValidateNPDGPUCountPlugin(ctx context.Context, s *Scenario) {
	s.T.Helper()
	// `test -f` exits non-zero when the config file is missing.
	script := "set -ex\n" +
		"test -f /etc/node-problem-detector.d/custom-plugin-monitor/gpu_checks/custom-plugin-gpu-count.json"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "NPD GPU count plugin configuration does not exist")
}
// validateNPDCondition polls the scenario's node (every 2s, for up to 3m, starting
// immediately) until one of its status conditions has a message containing
// conditionMessage, then asserts that condition's status equals conditionStatus and
// that its message contains conditionMessage (failing with conditionMessageErr).
//
// While polling, a condition whose type and reason equal conditionType and
// conditionReason is remembered as a fallback. If the poll ends without a message
// match, the assertions run against that fallback instead of failing with a bare
// timeout error.
//
// NOTE(review): the message match is evaluated for every condition regardless of its
// type/reason, so a condition of a different type with a matching message is accepted.
// Confirm this is intended before tightening it, since callers may rely on it.
func validateNPDCondition(ctx context.Context, s *Scenario, conditionType, conditionReason string, conditionStatus corev1.ConditionStatus, conditionMessage, conditionMessageErr string) {
	s.T.Helper()
	// Wait for NPD to report initial condition
	var condition *corev1.NodeCondition
	err := wait.PollUntilContextTimeout(ctx, 2*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
		node, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, s.Runtime.VM.KubeName, metav1.GetOptions{})
		if err != nil {
			s.T.Logf("Failed to get node %q: %v", s.Runtime.VM.KubeName, err)
			return false, nil // Continue polling on transient errors
		}

		// Check for condition with correct reason
		for i := range node.Status.Conditions {
			if string(node.Status.Conditions[i].Type) == conditionType && string(node.Status.Conditions[i].Reason) == conditionReason {
				condition = &node.Status.Conditions[i] // Found the partial condition we are looking for
			}

			if strings.Contains(node.Status.Conditions[i].Message, conditionMessage) {
				condition = &node.Status.Conditions[i]
				return true, nil // Found the exact condition we are looking for
			}
		}

		return false, nil // Continue polling until the condition is found or timeout occurs
	})
	// A poll error only matters when nothing at all was found; with a partial match the
	// assertions below produce a more specific failure.
	if err != nil && condition == nil {
		require.NoError(s.T, err, "timed out waiting for %s condition with reason %s to appear on node %q", conditionType, conditionReason, s.Runtime.VM.KubeName)
	}

	require.NotNil(s.T, condition, "expected to find %s condition with %s reason on node", conditionType, conditionReason)
	// testify expects (expected, actual); the order only affects how a failure is labelled.
	require.Equal(s.T, conditionStatus, condition.Status, "expected %s condition to be %s", conditionType, conditionStatus)
	require.Contains(s.T, condition.Message, conditionMessage, conditionMessageErr)
}
// ValidateNPDGPUCountCondition delegates to validateNPDCondition to check the healthy
// GPU count report: type GPUMissing, reason NoGPUMissing, status False, and a message
// containing "All GPUs are present".
func ValidateNPDGPUCountCondition(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const (
		condType   = "GPUMissing"
		condReason = "NoGPUMissing"
		wantMsg    = "All GPUs are present"
		failureMsg = "expected GPUMissing message to indicate correct count"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionFalse, wantMsg, failureMsg)
}
// ValidateNPDGPUCountAfterFailure unbinds GPU 0 from the nvidia driver on the VM,
// validates via validateNPDCondition that the GPUMissing condition becomes True with
// the "8 GPUs but found 7" message, and then re-binds the GPU.
//
// NOTE(review): nvidia-fabricmanager is stopped but only nvidia-persistenced is started
// again afterwards, and the restore step is skipped if the condition check fails —
// confirm both are acceptable for the scenarios using this validator.
func ValidateNPDGPUCountAfterFailure(ctx context.Context, s *Scenario) {
	s.T.Helper()

	disableGPU := strings.Join([]string{
		"set -ex",
		// Services holding on to the GPUs are stopped first (best effort).
		"sudo systemctl stop nvidia-persistenced.service || true",
		"sudo systemctl stop nvidia-fabricmanager || true",
		// Persistence mode off and compute mode back to default for GPU 0.
		"sudo nvidia-smi -i 0 -pm 0",
		"sudo nvidia-smi -i 0 -c 0",
		// The sed strips the leading "0000" from the PCI bus id (format needed for NVreg_ExcludeDevices).
		"PCI_ID=$(sudo nvidia-smi -i 0 --query-gpu=pci.bus_id --format=csv,noheader | sed 's/^0000//')",
		// The id is saved so the restore script can bind the same device again.
		"echo ${PCI_ID} | tee /tmp/npd_test_disabled_pci_id",
		"echo ${PCI_ID} | sudo tee /sys/bus/pci/drivers/nvidia/unbind",
	}, "\n")

	restoreGPU := strings.Join([]string{
		"set -ex",
		"cat /tmp/npd_test_disabled_pci_id | sudo tee /sys/bus/pci/drivers/nvidia/bind",
		// The temporary file is no longer needed once the GPU is bound again.
		"rm -f /tmp/npd_test_disabled_pci_id",
		"sudo systemctl start nvidia-persistenced.service || true",
	}, "\n")

	execScriptOnVMForScenarioValidateExitCode(ctx, s, disableGPU, 0, "failed to disable GPU")

	// NPD is expected to report the GPU count mismatch.
	validateNPDCondition(ctx, s, "GPUMissing", "GPUMissing", corev1.ConditionTrue,
		"Expected to see 8 GPUs but found 7. FaultCode: NHC2009", "expected GPUMissing message to indicate GPU count mismatch")

	// Return the VM to its original state by re-enabling the GPU.
	execScriptOnVMForScenarioValidateExitCode(ctx, s, restoreGPU, 0, "failed to re-enable GPU")
}
// ValidateNPDIBLinkFlappingCondition delegates to validateNPDCondition to check the
// healthy InfiniBand report: type IBLinkFlapping, reason NoIBLinkFlapping, status
// False, and a message containing "IB link is stable".
func ValidateNPDIBLinkFlappingCondition(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const (
		condType   = "IBLinkFlapping"
		condReason = "NoIBLinkFlapping"
		wantMsg    = "IB link is stable"
		failureMsg = "expected IBLinkFlapping message to indicate no flapping"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionFalse, wantMsg, failureMsg)
}
// ValidateNPDIBLinkFlappingAfterFailure simulates InfiniBand link flapping by appending
// three fake "ib0: lost carrier" lines to /var/log/syslog on the VM, 60 seconds apart,
// and then validates via validateNPDCondition that the IBLinkFlapping condition becomes
// True with the NHC2005 fault message.
func ValidateNPDIBLinkFlappingAfterFailure(ctx context.Context, s *Scenario) {
	s.T.Helper()

	const fakeEvents = 3
	script := []string{"set -ex"}
	for i := 0; i < fakeEvents; i++ {
		// A pause goes between events only, not before the first or after the last.
		if i > 0 {
			script = append(script, "sleep 60")
		}
		script = append(script, "echo \"$(date '+%b %d %H:%M:%S') $(hostname) fake error "+fmt.Sprint(i)+": [12346.123456] ib0: lost carrier\" | sudo tee -a /var/log/syslog")
	}
	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(script, "\n"), 0, "failed to simulate IB link flapping")

	// NPD is expected to report the flapping.
	const wantMsg = "check_ib_link_flapping: IB link flapping detected, multiple IB link flapping events within 6 hours. FaultCode: NHC2005"
	validateNPDCondition(ctx, s, "IBLinkFlapping", "IBLinkFlapping", corev1.ConditionTrue,
		wantMsg, "expected IBLinkFlapping message to indicate flapping")
}
// ValidateNPDUnhealthyNvidiaDevicePlugin requires that the NPD custom-plugin
// configuration file for the Nvidia device plugin check exists as a regular file on
// the VM.
func ValidateNPDUnhealthyNvidiaDevicePlugin(ctx context.Context, s *Scenario) {
	s.T.Helper()
	// `test -f` exits non-zero when the config file is missing.
	script := "set -ex\n" +
		"test -f /etc/node-problem-detector.d/custom-plugin-monitor/gpu_checks/custom-plugin-nvidia-device-plugin.json"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "NPD Nvidia device plugin configuration does not exist")
}
// ValidateNPDUnhealthyNvidiaDevicePluginCondition delegates to validateNPDCondition to
// check the healthy device plugin report: type UnhealthyNvidiaDevicePlugin, reason
// HealthyNvidiaDevicePlugin, status False, and a message containing
// "NVIDIA device plugin is running properly".
func ValidateNPDUnhealthyNvidiaDevicePluginCondition(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const (
		condType   = "UnhealthyNvidiaDevicePlugin"
		condReason = "HealthyNvidiaDevicePlugin"
		wantMsg    = "NVIDIA device plugin is running properly"
		failureMsg = "expected UnhealthyNvidiaDevicePlugin message to indicate healthy status"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionFalse, wantMsg, failureMsg)
}
// ValidateNPDUnhealthyNvidiaDevicePluginAfterFailure stops the nvidia-device-plugin
// systemd service on the VM, validates via validateNPDCondition that the
// UnhealthyNvidiaDevicePlugin condition becomes True, and then restarts the service
// (best effort).
func ValidateNPDUnhealthyNvidiaDevicePluginAfterFailure(ctx context.Context, s *Scenario) {
	s.T.Helper()

	// Stopping the service simulates the failure.
	stopScript := "set -ex\n" + "sudo systemctl stop nvidia-device-plugin.service"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, stopScript, 0, "failed to stop Nvidia device plugin service")

	validateNPDCondition(ctx, s, "UnhealthyNvidiaDevicePlugin", "UnhealthyNvidiaDevicePlugin", corev1.ConditionTrue,
		"Systemd service nvidia-device-plugin is not active", "expected UnhealthyNvidiaDevicePlugin message to indicate unhealthy status")

	// `|| true` keeps the script's exit code at 0 even if the restart itself fails.
	restartScript := "set -ex\n" + "sudo systemctl restart nvidia-device-plugin.service || true"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, restartScript, 0, "failed to restart Nvidia device plugin service")
}
// ValidateNPDUnhealthyNvidiaDCGMServices requires that the NPD custom-plugin
// configuration file for the Nvidia DCGM services check exists as a regular file on
// the VM.
func ValidateNPDUnhealthyNvidiaDCGMServices(ctx context.Context, s *Scenario) {
	s.T.Helper()
	// `test -f` exits non-zero when the config file is missing.
	script := "set -ex\n" +
		"test -f /etc/node-problem-detector.d/custom-plugin-monitor/gpu_checks/custom-plugin-nvidia-dcgm-services.json"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "NPD Nvidia DCGM services configuration does not exist")
}
// ValidateNPDUnhealthyNvidiaDCGMServicesCondition delegates to validateNPDCondition to
// check the healthy DCGM report: type UnhealthyNvidiaDCGMServices, reason
// HealthyNvidiaDCGMServices, status False, and a message containing
// "NVIDIA DCGM services are running properly".
func ValidateNPDUnhealthyNvidiaDCGMServicesCondition(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const (
		condType   = "UnhealthyNvidiaDCGMServices"
		condReason = "HealthyNvidiaDCGMServices"
		wantMsg    = "NVIDIA DCGM services are running properly"
		failureMsg = "expected UnhealthyNvidiaDCGMServices message to indicate healthy status"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionFalse, wantMsg, failureMsg)
}
// ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure stops nvidia-dcgm and then
// nvidia-dcgm-exporter on the VM. After each stop it validates via
// validateNPDCondition that the UnhealthyNvidiaDCGMServices condition is True and
// names the inactive service(s). Finally it restarts both services (best effort).
func ValidateNPDUnhealthyNvidiaDCGMServicesAfterFailure(ctx context.Context, s *Scenario) {
	s.T.Helper()
	const (
		condType   = "UnhealthyNvidiaDCGMServices"
		condReason = "UnhealthyNvidiaDCGMServices"
	)

	// First failure: only nvidia-dcgm is down.
	stopDCGM := "set -ex\n" + "sudo systemctl stop nvidia-dcgm.service"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, stopDCGM, 0, "failed to stop Nvidia DCGM service")
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionTrue,
		"Systemd service(s) nvidia-dcgm are not active", "expected UnhealthyNvidiaDCGMServices message to indicate unhealthy status")

	// Second failure: the exporter is down as well, so the message must list both services.
	stopExporter := "set -ex\n" + "sudo systemctl stop nvidia-dcgm-exporter.service"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, stopExporter, 0, "failed to stop Nvidia DCGM Exporter service")
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionTrue,
		"Systemd service(s) nvidia-dcgm nvidia-dcgm-exporter are not active", "expected UnhealthyNvidiaDCGMServices message to indicate unhealthy status for both services")

	// `|| true` keeps the script's exit code at 0 even if a restart fails.
	restartAll := "set -ex\n" +
		"sudo systemctl restart nvidia-dcgm.service || true\n" +
		"sudo systemctl restart nvidia-dcgm-exporter.service || true"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, restartAll, 0, "failed to restart Nvidia DCGM services")
}
// ValidateNPDHealthyNvidiaGridLicenseStatus requires that the NPD custom-plugin
// configuration file for the Nvidia GRID status check exists on the VM, then
// delegates to validateNPDCondition to check the healthy report: type
// NVIDIAGRIDStatusInvalid, reason NVIDIAGRIDStatusValid, status False, and a message
// containing "NVIDIA Grid Status Valid".
func ValidateNPDHealthyNvidiaGridLicenseStatus(ctx context.Context, s *Scenario) {
	s.T.Helper()

	// `test -f` exits non-zero when the config file is missing.
	configCheck := "set -ex\n" +
		"test -f /etc/node-problem-detector.d/custom-plugin-monitor/gpu_checks/custom-plugin-nvidia-grid-status.json"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, configCheck, 0, "NPD Nvidia Grid License check configuration does not exist")

	const (
		condType   = "NVIDIAGRIDStatusInvalid"
		condReason = "NVIDIAGRIDStatusValid"
		wantMsg    = "NVIDIA Grid Status Valid"
		failureMsg = "expected NVIDIAGRIDStatusValid message to indicate healthy status"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionFalse, wantMsg, failureMsg)
}
// ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure stops the nvidia-gridd
// systemd service on the VM, validates via validateNPDCondition that a condition with
// status True and a message containing "nvidia-gridd is not active" is reported, and
// then restarts the service (best effort).
//
// NOTE(review): the condition type/reason passed here contain spaces and differ from
// the "NVIDIAGRIDStatusInvalid"/"NVIDIAGRIDStatusValid" pair used by the healthy check
// — confirm against the NPD plugin configuration.
func ValidateNPDUnhealthyNvidiaGridLicenseStatusAfterFailure(ctx context.Context, s *Scenario) {
	s.T.Helper()

	// Stopping the service simulates the failure.
	stopScript := "set -ex\n" + "sudo systemctl stop nvidia-gridd.service"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, stopScript, 0, "failed to stop Nvidia GRID service")

	const (
		condType   = "NVIDIA GRID Status Invalid"
		condReason = "NVIDIA GRID Status Valid"
		wantMsg    = "nvidia-gridd is not active"
		failureMsg = "expected UnhealthyNVIDIA GRID Status message to indicate unhealthy status"
	)
	validateNPDCondition(ctx, s, condType, condReason, corev1.ConditionTrue, wantMsg, failureMsg)

	// `|| true` keeps the script's exit code at 0 even if the restart itself fails.
	restartScript := "set -ex\n" + "sudo systemctl restart nvidia-gridd.service || true"
	execScriptOnVMForScenarioValidateExitCode(ctx, s, restartScript, 0, "failed to restart Nvidia GRID services")
}
// ValidateRuncVersion requires that versions holds exactly one moby-runc version, that
// it parses as a semantic version of at least 1.2 (compared on major and minor), and
// that this version is among the installed packages.
func ValidateRuncVersion(ctx context.Context, s *Scenario, versions []string) {
	s.T.Helper()
	require.Lenf(s.T, versions, 1, "Expected exactly one version for moby-runc but got %d", len(versions))

	// The local is deliberately not named "semver" so it does not shadow the package.
	parsed, err := semver.ParseTolerant(versions[0])
	require.NoError(s.T, err, "failed to parse semver from moby-runc version")

	// Major and minor are compared together: checking "major >= 1" and "minor >= 2"
	// independently would wrongly reject versions such as 2.0.x.
	meetsMinimum := parsed.Major > 1 || (parsed.Major == 1 && parsed.Minor >= 2)
	require.Truef(s.T, meetsMinimum, "expected moby-runc version to be at least 1.2.0, got %s", versions[0])

	ValidateInstalledPackageVersion(ctx, s, "moby-runc", versions[0])
}
// ValidateWindowsProcessHasCliArguments queries the command line of the first
// Win32_Process instance named processName on the VM and requires that every entry in
// arguments appears as a whole whitespace-separated token of that command line.
//
// Tokens are split on any whitespace, so a trailing newline in the command output does
// not stick to the last argument. Arguments that themselves contain spaces cannot be
// matched by this helper.
func ValidateWindowsProcessHasCliArguments(ctx context.Context, s *Scenario, processName string, arguments []string) {
	// Marked as a helper like the other validators so failures point at the caller.
	s.T.Helper()
	script := fmt.Sprintf("(Get-CimInstance Win32_Process -Filter \"name='%[1]s'\")[0].CommandLine", processName)

	execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not validate command has parameters - might mean file does not have params, might mean something went wrong")

	actualArgs := strings.Fields(execResult.stdout)
	for _, expectedArgument := range arguments {
		require.Contains(s.T, actualArgs, expectedArgument, "expected command line of %s to contain argument %q", processName, expectedArgument)
	}
}
// ValidateWindowsProcessContainsArgumentStrings delegates to
// validateWindowsProccessArgumentString with require.Contains as the assertion applied
// for the given process name and substrings.
func ValidateWindowsProcessContainsArgumentStrings(ctx context.Context, s *Scenario, processName string, substrings []string) {
	// Marked as a helper like the other validators so failures point at the caller.
	s.T.Helper()
	validateWindowsProccessArgumentString(ctx, s, processName, substrings, require.Contains)
}
// ValidateWindowsProcessDoesNotContainArgumentStrings delegates to
// validateWindowsProccessArgumentString with require.NotContains as the assertion
// applied for the given process name and substrings.
func ValidateWindowsProcessDoesNotContainArgumentStrings(ctx context.Context, s *Scenario, processName string, substrings []string) {
	// Marked as a helper like the other validators so failures point at the caller.
	s.T.Helper()
	validateWindowsProccessArgumentString(ctx, s, processName, substrings, require.NotContains)
}
func validateWindowsProccessArgumentString(ctx context.Context, s *Scenario, processName string, substrings []string, assert func(t require.TestingT, s any, contains any, msgAndArgs ...any)) {
steps := []string{
fmt.Sprintf("(Get-CimInstance Win32_Process -Filter \"name='%[1]s'\")[0].CommandLine", processName),
}