Skip to content

Commit 1881799

Browse files
authored
added end to end test case for MachineHealthCheckUnterminatedShortCircuitSRE (#457)
1 parent a59dd9a commit 1881799

File tree

2 files changed

+143
-1
lines changed

2 files changed

+143
-1
lines changed

test/e2e/configuration_anomaly_detection_test.go

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package osde2etests
66
import (
77
"context"
88
"fmt"
9+
"log"
910
"os"
1011
"strings"
1112
"time"
@@ -16,14 +17,19 @@ import (
1617
"github.com/onsi/ginkgo/v2"
1718
. "github.com/onsi/ginkgo/v2"
1819
. "github.com/onsi/gomega"
20+
v1beta1 "github.com/openshift/api/machine/v1beta1"
1921
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
22+
machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
2023
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
2124
"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
2225
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
2326
"github.com/openshift/osde2e-common/pkg/clients/openshift"
2427
appsv1 "k8s.io/api/apps/v1"
2528
corev1 "k8s.io/api/core/v1"
29+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/apimachinery/pkg/types"
2631
"k8s.io/client-go/util/retry"
32+
pclient "sigs.k8s.io/controller-runtime/pkg/client"
2733
logger "sigs.k8s.io/controller-runtime/pkg/log"
2834
)
2935

@@ -127,7 +133,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
127133
_, err = testPdClient.TriggerIncident("ClusterHasGoneMissing", clusterID)
128134
Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
129135

130-
time.Sleep(3 * time.Minute)
136+
time.Sleep(5 * time.Minute)
131137

132138
lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
133139
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
@@ -379,4 +385,125 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
379385
Expect(lsResponseAfter.Items().Len()).To(BeNumerically(">", lsReasonsBefore),
380386
"Expected more limited support reasons after infrastructure node shutdown")
381387
})
388+
389+
// End-to-end check for the MachineHealthCheckUnterminatedShortCircuitSRE alert on AWS:
// the Ready condition of one machine's node is forced to False, the PagerDuty
// incident is triggered, and the spec then polls until the number of Ready,
// schedulable nodes and the total number of nodes both equal the number of
// machines observed at the start.
It("AWS CCS: MachineHealthCheckUnterminatedShortCircuitSRE - node is NotReady", func(ctx context.Context) {
	// The scenario only applies to AWS clusters; on other providers the spec is a no-op.
	if provider != "aws" {
		return
	}

	kubeConfigPath := os.Getenv("KUBECONFIG")
	kubeClient, err := utils.CreateClientFromKubeConfig(kubeConfigPath)
	if err != nil {
		// Log and fail through Gomega below. log.Fatalf would call os.Exit and
		// terminate the whole suite without Ginkgo reporting or cleanup.
		log.Printf("Error creating Kubernetes client: %v", err)
	}
	Expect(err).NotTo(HaveOccurred(), "Failed to create Kubernetes client")

	// Fetch machine list in the 'openshift-machine-api' namespace
	machineList := &v1beta1.MachineList{}
	err = kubeClient.List(ctx, machineList, &pclient.ListOptions{
		Namespace: machineutil.MachineNamespace,
	})
	Expect(err).ToNot(HaveOccurred(), "Failed to list machines")
	// Guard the Items[0] access below: an empty list would otherwise panic.
	Expect(machineList.Items).NotTo(BeEmpty(), "No machines found to run the test against")

	// Get the node backing the first machine
	machine := &machineList.Items[0]
	node, err := machineutil.GetNodeForMachine(ctx, kubeClient, *machine)
	Expect(err).NotTo(HaveOccurred(), "Failed to get Node for Machine")
	Expect(node).NotTo(BeNil(), "Node for Machine is nil")

	nodeName := node.Name
	// The baseline is the number of machines; the polling loop below compares
	// node counts against it.
	originalNodeCount := len(machineList.Items)
	ginkgo.GinkgoWriter.Printf("Original node count: %d\n", originalNodeCount)

	// Simulate 'NotReady' condition for the node
	ginkgo.GinkgoWriter.Printf("Step 1: Changing status to NotReady for Node:: %s\n", nodeName)
	retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Re-read the node on every attempt so the update is applied to the
		// latest version after a conflict.
		key := types.NamespacedName{Name: nodeName}
		n := &corev1.Node{}
		if err := kubeClient.Get(ctx, key, n); err != nil {
			return err
		}

		updated := false
		for i, cond := range n.Status.Conditions {
			if cond.Type == corev1.NodeReady {
				n.Status.Conditions[i].Status = corev1.ConditionFalse
				updated = true
				break
			}
		}
		if !updated {
			// No Ready condition present yet: add one that is already False.
			n.Status.Conditions = append(n.Status.Conditions, corev1.NodeCondition{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionFalse,
				LastHeartbeatTime:  metav1.Now(),
				LastTransitionTime: metav1.Now(),
			})
		}

		return kubeClient.Status().Update(ctx, n)
	})
	Expect(retryErr).NotTo(HaveOccurred(), "Failed to update Node to simulate NotReady condition")

	// Wait for fallback logic to take effect
	ginkgo.GinkgoWriter.Println("Step 2: Node set to NotReady. Triggering PagerDuty Alert. Waiting.....")

	_, err = testPdClient.TriggerIncident("MachineHealthCheckUnterminatedShortCircuitSRE", clusterID)
	Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")

	time.Sleep(5 * time.Second)

	// Poll every 30 seconds until the original number of nodes are Ready and not SchedulingDisabled
	checkInterval := 30 * time.Second
	timeout := 6 * time.Minute // Total time to wait for nodes to be Ready

	startTime := time.Now()
	for {
		// List all nodes in the cluster
		nodeList := &corev1.NodeList{}
		err = kubeClient.List(ctx, nodeList, &pclient.ListOptions{})
		Expect(err).NotTo(HaveOccurred(), "Failed to list nodes")

		// Report when there are still more nodes than at the start
		currentNodeCount := len(nodeList.Items)
		if currentNodeCount > originalNodeCount {
			ginkgo.GinkgoWriter.Printf("Step 3: Found %d nodes, waiting for node count to match original %d...\n", currentNodeCount, originalNodeCount)
		}

		// Count nodes that are Ready and not SchedulingDisabled
		readyNodeCount := 0
		for _, n := range nodeList.Items {
			isReady := false
			for _, cond := range n.Status.Conditions {
				if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
					isReady = true
					break
				}
			}

			if isReady && !n.Spec.Unschedulable {
				readyNodeCount++
			}
		}

		// Log node status and count after every check
		ginkgo.GinkgoWriter.Printf("Step 4: Node status checked. Ready Node count: %d\n", readyNodeCount)

		if readyNodeCount == originalNodeCount && currentNodeCount == originalNodeCount {
			ginkgo.GinkgoWriter.Println("Step 5: All nodes are in Ready state and not SchedulingDisabled. Test passed.")
			break
		}

		if time.Since(startTime) > timeout {
			// Reaching this point means the success condition above was not met,
			// so fail unconditionally. Asserting only on the ready count would let
			// the spec pass while the total node count still differs.
			Fail(fmt.Sprintf("Timed out waiting for all nodes to become Ready. Ready nodes: %d, total nodes: %d, expected: %d",
				readyNodeCount, currentNodeCount, originalNodeCount))
		}

		// If not all nodes are ready or if there are more nodes, wait for the next interval
		ginkgo.GinkgoWriter.Printf("Step 6: Not all nodes are Ready or node count mismatch. Waiting for %s...\n", checkInterval)
		time.Sleep(checkInterval)
	}

	ginkgo.GinkgoWriter.Println("Step 7: Test completed: Node NotReady condition simulated and checked.")
})
508+
382509
}, ginkgo.ContinueOnFailure)

test/e2e/utils/utils.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1"
1111
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1212
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
13+
"k8s.io/client-go/tools/clientcmd"
14+
pclient "sigs.k8s.io/controller-runtime/pkg/client"
1315
)
1416

1517
func GetLimitedSupportReasons(ocme2eCli *ocme2e.Client, clusterID string) (*cmv1.LimitedSupportReasonsListResponse, error) {
@@ -29,3 +31,16 @@ func GetServiceLogs(ocmCli ocm.Client, cluster *cmv1.Cluster) (*servicelogsv1.Cl
2931
}
3032
return clusterLogsUUIDListResponse, nil
3133
}
34+
35+
func CreateClientFromKubeConfig(kubeConfigPath string) (pclient.Client, error) {
36+
// Load kubeconfig file and create a client
37+
cfg, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath)
38+
if err != nil {
39+
return nil, fmt.Errorf("failed to build kubeconfig: %v", err)
40+
}
41+
cl, err := pclient.New(cfg, pclient.Options{})
42+
if err != nil {
43+
return nil, fmt.Errorf("failed to create Kubernetes client: %v", err)
44+
}
45+
return cl, nil
46+
}

0 commit comments

Comments
 (0)