@@ -6,6 +6,7 @@ package osde2etests
 import (
 	"context"
 	"fmt"
+	"log"
 	"os"
 	"strings"
 	"time"
@@ -16,14 +17,19 @@ import (
 	"github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	v1beta1 "github.com/openshift/api/machine/v1beta1"
 	awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
+	machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
 	"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
 	"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
 	ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
 	"github.com/openshift/osde2e-common/pkg/clients/openshift"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
+	pclient "sigs.k8s.io/controller-runtime/pkg/client"
 	logger "sigs.k8s.io/controller-runtime/pkg/log"
 )
 
@@ -127,7 +133,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
 		_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
 		Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
 
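+		// Give CAD time to investigate and post limited support reasons before
+		// re-querying OCM below.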
-		time.Sleep(3 * time.Minute)
+		time.Sleep(5 * time.Minute)
 
 		lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
 		Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
@@ -379,4 +385,125 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
 		Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore),
 			"Expected more limited support reasons after infrastructure node shutdown")
 	})
+
+	It("AWS CCS: MachineHealthCheckUnterminatedShortCircuitSRE - node is NotReady", func(ctx context.Context) {
+		if provider == "aws" {
+			kubeConfigPath := os.Getenv("KUBECONFIG")
+			kubeClient, err := utils.CreateClientFromKubeConfig(kubeConfigPath)
+			if err != nil {
+				log.Fatalf("Error creating Kubernetes client: %v", err)
+			}
+
+			// Fetch the machine list in the 'openshift-machine-api' namespace
+			machineList := &v1beta1.MachineList{}
+			err = kubeClient.List(ctx, machineList, &pclient.ListOptions{
+				Namespace: machineutil.MachineNamespace,
+			})
+			Expect(err).ToNot(HaveOccurred(), "Failed to list machines")
+
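+			// The first machine is picked arbitrarily; this assumes the namespace
+			// contains at least one machine and that its node can safely be
+			// flipped to NotReady for the test.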
+			// Get the node for the first machine
+			machine := &machineList.Items[0]
+			node, err := machineutil.GetNodeForMachine(ctx, kubeClient, *machine)
+			Expect(err).NotTo(HaveOccurred(), "Failed to get Node for Machine")
+			Expect(node).NotTo(BeNil(), "Node for Machine is nil")
+
+			nodeName := node.Name
+			originalNodeCount := len(machineList.Items)
+			ginkgo.GinkgoWriter.Printf("Original node count: %d\n", originalNodeCount)
+
+			// Simulate a 'NotReady' condition on the node
+			ginkgo.GinkgoWriter.Printf("Step 1: Changing status to NotReady for Node: %s\n", nodeName)
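+			// retry.RetryOnConflict re-runs this get-modify-update sequence whenever
+			// the API server rejects the status update with a conflict error.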
+			retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				key := types.NamespacedName{Name: nodeName}
+				n := &corev1.Node{}
+				if err := kubeClient.Get(ctx, key, n); err != nil {
+					return err
+				}
+
+				updated := false
+				for i, cond := range n.Status.Conditions {
+					if cond.Type == corev1.NodeReady {
+						n.Status.Conditions[i].Status = corev1.ConditionFalse
+						updated = true
+						break
+					}
+				}
+				if !updated {
+					n.Status.Conditions = append(n.Status.Conditions, corev1.NodeCondition{
+						Type:               corev1.NodeReady,
+						Status:             corev1.ConditionFalse,
+						LastHeartbeatTime:  metav1.Now(),
+						LastTransitionTime: metav1.Now(),
+					})
+				}
+
+				return kubeClient.Status().Update(ctx, n)
+			})
+			Expect(retryErr).NotTo(HaveOccurred(), "Failed to update Node to simulate NotReady condition")
+
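+			// Note: the NodeReady condition is normally owned by the kubelet, which
+			// may set it back to True on its next status sync; the alert below is
+			// triggered immediately so the NotReady state can be observed first.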
+			// Wait for the fallback logic to take effect
+			ginkgo.GinkgoWriter.Println("Step 2: Node set to NotReady. Triggering PagerDuty alert. Waiting...")
+
+			_, err = testPdClient.TriggerIncident("MachineHealthCheckUnterminatedShortCircuitSRE", clusterID)
+			Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
+
+			time.Sleep(5 * time.Second)
+
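+			// MachineHealthCheck remediation presumably deletes and replaces the
+			// affected machine, so the node count may fluctuate before settling
+			// back at the original value.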
+			// Poll every 30 seconds until the original number of nodes is Ready and not SchedulingDisabled
+			checkInterval := 30 * time.Second
+			timeout := 6 * time.Minute // Total time to wait for nodes to become Ready
+
+			startTime := time.Now()
+			for {
+				// List all nodes in the cluster
+				nodeList := &corev1.NodeList{}
+				err = kubeClient.List(ctx, nodeList, &pclient.ListOptions{})
+				Expect(err).NotTo(HaveOccurred(), "Failed to list nodes")
+
+				// Check whether the node count is back to the original count
+				currentNodeCount := len(nodeList.Items)
+				if currentNodeCount != originalNodeCount {
+					ginkgo.GinkgoWriter.Printf("Step 3: Found %d nodes, waiting for node count to match original %d...\n", currentNodeCount, originalNodeCount)
+				}
+
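+				// Requiring !Unschedulable filters out nodes that are still cordoned
+				// (SchedulingDisabled), which presumably means remediation is still
+				// draining or replacing them.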
+				// Count nodes that are Ready and not SchedulingDisabled
+				readyNodeCount := 0
+				for _, n := range nodeList.Items {
+					isReady := false
+					// Check whether the node is Ready
+					for _, cond := range n.Status.Conditions {
+						if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+							isReady = true
+							break
+						}
+					}
+
+					// Check that the node is NOT SchedulingDisabled
+					if isReady && !n.Spec.Unschedulable {
+						readyNodeCount++
+					}
+				}
+
+				// Log node status and count after every check
+				ginkgo.GinkgoWriter.Printf("Step 4: Node status checked. Ready node count: %d\n", readyNodeCount)
+
+				if readyNodeCount == originalNodeCount && currentNodeCount == originalNodeCount {
+					ginkgo.GinkgoWriter.Println("Step 5: All nodes are Ready and not SchedulingDisabled. Test passed.")
+					break
+				}
+
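+				// Fail the spec via the Expect below once the timeout elapses rather
+				// than polling forever.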
+				if time.Since(startTime) > timeout {
+					Expect(readyNodeCount).To(Equal(originalNodeCount), "Timed out waiting for all nodes to become Ready.")
+					break
+				}
+
+				// Not all nodes are Ready yet, or the node count does not match; wait for the next interval
+				ginkgo.GinkgoWriter.Printf("Step 6: Not all nodes are Ready or node count mismatch. Waiting %s...\n", checkInterval)
+				time.Sleep(checkInterval)
+			}
+
+			ginkgo.GinkgoWriter.Println("Step 7: Test completed: Node NotReady condition simulated and checked.")
+		}
+	})
+
 }, ginkgo.ContinueOnFailure)