@@ -6,6 +6,7 @@ package osde2etests
 import (
 	"context"
 	"fmt"
+	"log"
 	"os"
 	"strings"
 	"time"
@@ -16,14 +17,19 @@ import (
 	"github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	v1beta1 "github.com/openshift/api/machine/v1beta1"
 	awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
+	machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
 	"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
 	"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
 	ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
 	"github.com/openshift/osde2e-common/pkg/clients/openshift"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"
+	pclient "sigs.k8s.io/controller-runtime/pkg/client"
 	logger "sigs.k8s.io/controller-runtime/pkg/log"
 )
 
@@ -127,7 +133,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
 		_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
 		Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
 
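+		// Give CAD time to investigate and post limited support reasons before
+		// re-querying OCM below.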
-		time.Sleep(3 * time.Minute)
+		time.Sleep(5 * time.Minute)
 
 		lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
 		Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
@@ -379,4 +385,125 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
 		Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore),
 			"Expected more limited support reasons after infrastructure node shutdown")
 	})
+
+	It("AWS CCS: MachineHealthCheckUnterminatedShortCircuitSRE - node is NotReady", func(ctx context.Context) {
+		if provider == "aws" {
+			kubeConfigPath := os.Getenv("KUBECONFIG")
+			kubeClient, err := utils.CreateClientFromKubeConfig(kubeConfigPath)
+			if err != nil {
+				log.Fatalf("Error creating Kubernetes client: %v", err)
+			}
+
+			// Fetch the machine list in the 'openshift-machine-api' namespace
+			machineList := &v1beta1.MachineList{}
+			err = kubeClient.List(ctx, machineList, &pclient.ListOptions{
+				Namespace: machineutil.MachineNamespace,
+			})
+			Expect(err).ToNot(HaveOccurred(), "Failed to list machines")
+
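+			// The first machine is picked arbitrarily; this assumes the namespace
+			// contains at least one machine and that its node can safely be
+			// flipped to NotReady for the test.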
+			// Get the node for the first machine
+			machine := &machineList.Items[0]
+			node, err := machineutil.GetNodeForMachine(ctx, kubeClient, *machine)
+			Expect(err).NotTo(HaveOccurred(), "Failed to get Node for Machine")
+			Expect(node).NotTo(BeNil(), "Node for Machine is nil")
+
+			nodeName := node.Name
+			originalNodeCount := len(machineList.Items)
+			ginkgo.GinkgoWriter.Printf("Original node count: %d\n", originalNodeCount)
+
+			// Simulate a 'NotReady' condition on the node
+			ginkgo.GinkgoWriter.Printf("Step 1: Changing status to NotReady for Node: %s\n", nodeName)
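+			// retry.RetryOnConflict re-runs this get-modify-update sequence whenever
+			// the API server rejects the status update with a conflict error.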
+			retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				key := types.NamespacedName{Name: nodeName}
+				n := &corev1.Node{}
+				if err := kubeClient.Get(ctx, key, n); err != nil {
+					return err
+				}
+
+				updated := false
+				for i, cond := range n.Status.Conditions {
+					if cond.Type == corev1.NodeReady {
+						n.Status.Conditions[i].Status = corev1.ConditionFalse
+						updated = true
+						break
+					}
+				}
+				if !updated {
+					n.Status.Conditions = append(n.Status.Conditions, corev1.NodeCondition{
+						Type:               corev1.NodeReady,
+						Status:             corev1.ConditionFalse,
+						LastHeartbeatTime:  metav1.Now(),
+						LastTransitionTime: metav1.Now(),
+					})
+				}
+
+				return kubeClient.Status().Update(ctx, n)
+			})
+			Expect(retryErr).NotTo(HaveOccurred(), "Failed to update Node to simulate NotReady condition")
+
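+			// Note: the NodeReady condition is normally owned by the kubelet, which
+			// may set it back to True on its next status sync; the alert below is
+			// triggered immediately so the NotReady state can be observed first.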
+			// Wait for the fallback logic to take effect
+			ginkgo.GinkgoWriter.Println("Step 2: Node set to NotReady. Triggering PagerDuty alert. Waiting...")
+
+			_, err = testPdClient.TriggerIncident("MachineHealthCheckUnterminatedShortCircuitSRE", clusterID)
+			Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
+
+			time.Sleep(5 * time.Second)
+
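+			// MachineHealthCheck remediation presumably deletes and replaces the
+			// affected machine, so the node count may fluctuate before settling
+			// back at the original value.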
+			// Poll every 30 seconds until the original number of nodes is Ready and not SchedulingDisabled
+			checkInterval := 30 * time.Second
+			timeout := 6 * time.Minute // Total time to wait for nodes to become Ready
+
+			startTime := time.Now()
+			for {
+				// List all nodes in the cluster
+				nodeList := &corev1.NodeList{}
+				err = kubeClient.List(ctx, nodeList, &pclient.ListOptions{})
+				Expect(err).NotTo(HaveOccurred(), "Failed to list nodes")
+
+				// Check whether the node count is back to the original count
+				currentNodeCount := len(nodeList.Items)
+				if currentNodeCount != originalNodeCount {
+					ginkgo.GinkgoWriter.Printf("Step 3: Found %d nodes, waiting for node count to match original %d...\n", currentNodeCount, originalNodeCount)
+				}
+
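+				// Requiring !Unschedulable filters out nodes that are still cordoned
+				// (SchedulingDisabled), which presumably means remediation is still
+				// draining or replacing them.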
+				// Count nodes that are Ready and not SchedulingDisabled
+				readyNodeCount := 0
+				for _, n := range nodeList.Items {
+					isReady := false
+					// Check whether the node is Ready
+					for _, cond := range n.Status.Conditions {
+						if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+							isReady = true
+							break
+						}
+					}
+
+					// Check that the node is NOT SchedulingDisabled
+					if isReady && !n.Spec.Unschedulable {
+						readyNodeCount++
+					}
+				}
+
+				// Log node status and count after every check
+				ginkgo.GinkgoWriter.Printf("Step 4: Node status checked. Ready node count: %d\n", readyNodeCount)
+
+				if readyNodeCount == originalNodeCount && currentNodeCount == originalNodeCount {
+					ginkgo.GinkgoWriter.Println("Step 5: All nodes are Ready and not SchedulingDisabled. Test passed.")
+					break
+				}
+
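+				// Fail the spec via the Expect below once the timeout elapses rather
+				// than polling forever.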
+				if time.Since(startTime) > timeout {
+					Expect(readyNodeCount).To(Equal(originalNodeCount), "Timed out waiting for all nodes to become Ready.")
+					break
+				}
+
+				// Not all nodes are Ready yet, or the node count does not match; wait for the next interval
+				ginkgo.GinkgoWriter.Printf("Step 6: Not all nodes are Ready or node count mismatch. Waiting %s...\n", checkInterval)
+				time.Sleep(checkInterval)
+			}
+
+			ginkgo.GinkgoWriter.Println("Step 7: Test completed: Node NotReady condition simulated and checked.")
+		}
+	})
+
 }, ginkgo.ContinueOnFailure)