Skip to content

Commit fc8ba3f

Browse files
Merge pull request #476 from ratnam915/feature/OSD-30036
OSD-30036: To create E2E Test for CAD - Cluster has gone missing - UpgradeConfigSyncFailureOver4Hr
2 parents ce86894 + c948870 commit fc8ba3f

File tree

3 files changed

+213
-26
lines changed

3 files changed

+213
-26
lines changed

test/e2e/configuration_anomaly_detection_test.go

Lines changed: 133 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package osde2etests
55

66
import (
77
"context"
8+
"encoding/json"
89
"fmt"
910
"log"
1011
"os"
@@ -20,7 +21,6 @@ import (
2021
v1beta1 "github.com/openshift/api/machine/v1beta1"
2122
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
2223
machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
23-
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
2424
"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
2525
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
2626
"github.com/openshift/osde2e-common/pkg/clients/openshift"
@@ -36,7 +36,6 @@ import (
3636
var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
3737
var (
3838
ocme2eCli *ocme2e.Client
39-
ocmCli ocm.Client
4039
k8s *openshift.Client
4140
region string
4241
provider string
@@ -52,18 +51,13 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
5251
clientID := os.Getenv("CLIENT_ID")
5352
clientSecret := os.Getenv("CLIENT_SECRET")
5453
clusterID = os.Getenv("OCM_CLUSTER_ID")
55-
cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")
5654

5755
Expect(ocmToken).NotTo(BeEmpty(), "OCM_TOKEN must be set")
5856
Expect(clusterID).NotTo(BeEmpty(), "CLUSTER_ID must be set")
59-
Expect(cadOcmFilePath).NotTo(BeEmpty(), "CAD_OCM_FILE_PATH must be set")
6057

6158
ocme2eCli, err = ocme2e.New(ctx, ocmToken, clientID, clientSecret, ocmEnv)
6259
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup E2E OCM Client")
6360

64-
ocmCli, err = ocm.New(cadOcmFilePath)
65-
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup ocm anomaly detection client")
66-
6761
k8s, err = openshift.New(ginkgo.GinkgoLogr)
6862
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup k8s client")
6963

@@ -165,7 +159,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
165159
Expect(cluster).ToNot(BeNil(), "received nil cluster from OCM")
166160

167161
// Get service logs
168-
logs, err := utils.GetServiceLogs(ocmCli, cluster)
162+
logs, err := utils.GetServiceLogs(ocme2eCli, cluster)
169163
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
170164
logsBefore := logs.Items().Slice()
171165

@@ -228,7 +222,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
228222

229223
time.Sleep(1 * time.Minute)
230224

231-
logs, err = utils.GetServiceLogs(ocmCli, cluster)
225+
logs, err = utils.GetServiceLogs(ocme2eCli, cluster)
232226
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
233227
logsAfter := logs.Items().Slice()
234228

@@ -505,8 +499,8 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
505499
ginkgo.GinkgoWriter.Println("Step 7: Test completed: Node NotReady condition simulated and checked.")
506500
}
507501
})
508-
509-
It("AWS CCS: clustermonitoringerrorbudgetburn", func(ctx context.Context) {
502+
503+
It("AWS CCS: clustermonitoringerrorbudgetburn", func(ctx context.Context) {
510504
if provider == "aws" {
511505
const (
512506
namespace = "openshift-user-workload-monitoring"
@@ -520,7 +514,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
520514
Expect(cluster).ToNot(BeNil(), "Cluster response is nil")
521515

522516
fmt.Println("Step 1: Getting service logs before misconfiguration")
523-
logs, err := utils.GetServiceLogs(ocmCli, cluster)
517+
logs, err := utils.GetServiceLogs(ocme2eCli, cluster)
524518
Expect(err).ToNot(HaveOccurred(), "Failed to fetch service logs before misconfig")
525519
logsBefore := logs.Items().Slice()
526520

@@ -574,15 +568,15 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
574568
time.Sleep(2 * time.Minute)
575569

576570
fmt.Println("Step 5: Fetching service logs after misconfiguration")
577-
logs, err = utils.GetServiceLogs(ocmCli, cluster)
571+
logs, err = utils.GetServiceLogs(ocme2eCli, cluster)
578572
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
579573
logsAfter := logs.Items().Slice()
580574

581575
Expect(logsAfter).To(HaveLen(len(logsBefore)), "Service logs count changed after scale down/up")
582-
}
583-
})
584-
585-
It("AWS CCS: InsightsOperatorDown (blocked egress)", Label("aws", "ccs", "insights-operator", "blocking-egress"), func(ctx context.Context) {
576+
}
577+
})
578+
579+
It("AWS CCS: InsightsOperatorDown (blocked egress)", Label("aws", "ccs", "insights-operator", "blocking-egress"), func(ctx context.Context) {
586580
if provider == "aws" {
587581
awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID")
588582
awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY")
@@ -616,7 +610,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
616610
Expect(err).NotTo(HaveOccurred(), "Failed to get security group ID")
617611

618612
// Step 1: Get logs before action
619-
logsBefore, err := utils.GetServiceLogs(ocmCli, cluster)
613+
logsBefore, err := utils.GetServiceLogs(ocme2eCli, cluster)
620614
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs before action")
621615

622616
existingLogIDs := map[string]bool{}
@@ -656,8 +650,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
656650
_, err = testPdClient.TriggerIncident("InsightsOperatorDown", clusterID)
657651
Expect(err).NotTo(HaveOccurred(), "Failed to trigger silent PagerDuty alert")
658652

653+
time.Sleep(2 * time.Minute)
654+
659655
// Step 4: Get logs again and find new entries
660-
logsAfter, err := utils.GetServiceLogs(ocmCli, cluster)
656+
logsAfter, err := utils.GetServiceLogs(ocme2eCli, cluster)
661657
Expect(err).ToNot(HaveOccurred(), "Failed to get service logs after action")
662658

663659
newLogs := []interface{}{}
@@ -672,4 +668,122 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
672668
}
673669
})
674670

671+
// Spec: corrupts the cluster pull secret, triggers the
// UpgradeConfigSyncFailureOver4HrSRE PagerDuty incident and reports how the
// cluster's limited support reasons look after the investigation had time to run.
// The original pull secret data is written back once the spec finishes.
It("UpgradeConfigSyncFailureOver4Hr: corrupted pull secret investigation", Label("pull-secret", "upgrade-config-sync", "user-banned-check"), func(ctx context.Context) {
	const (
		pullSecretName      = "pull-secret"
		pullSecretNamespace = "openshift-config"
	)

	// Get cluster information from OCM so the spec fails early on a bad cluster ID.
	response, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
	Expect(err).ToNot(HaveOccurred(), "failed to get cluster from OCM")
	cluster := response.Body()
	Expect(cluster).ToNot(BeNil(), "received nil cluster from OCM")

	// Record the baseline. haveBaseline distinguishes "zero reasons" from
	// "could not fetch", which a plain integer cannot express.
	lsReasonsBefore, haveBaseline := 0, false
	lsResponseBefore, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
	if err != nil {
		ginkgo.GinkgoWriter.Printf("Could not get limited support reasons before test: %v\n", err)
	} else {
		haveBaseline = true
		lsReasonsBefore = lsResponseBefore.Items().Len()
		ginkgo.GinkgoWriter.Printf("Limited support reasons before pull secret corruption %d\n", lsReasonsBefore)
	}

	// Get the original pull secret for backup.
	var originalPullSecret corev1.Secret
	err = k8s.Get(ctx, pullSecretName, pullSecretNamespace, &originalPullSecret)
	Expect(err).NotTo(HaveOccurred(), "Failed to get original pull secret")
	ginkgo.GinkgoWriter.Print("Original pull secret retrieved successfully\n")

	// Register the restoration as a Ginkgo cleanup rather than a plain defer:
	// the cleanup receives its own context, so the secret is still restored
	// when the spec context was cancelled by a timeout or an interrupt.
	ginkgo.DeferCleanup(func(cleanupCtx context.Context) {
		ginkgo.GinkgoWriter.Print("Restoring original pull secret...\n")
		restoreErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
			currentSecret := &corev1.Secret{}
			if getErr := k8s.Get(cleanupCtx, pullSecretName, pullSecretNamespace, currentSecret); getErr != nil {
				return getErr
			}
			// Restore original data on top of the latest resource version.
			currentSecret.Data = originalPullSecret.Data
			return k8s.Update(cleanupCtx, currentSecret)
		})
		if restoreErr != nil {
			// Best effort: report the failure without failing the spec.
			ginkgo.GinkgoWriter.Printf("Failed to restore original pull secret: %v\n", restoreErr)
			return
		}
		ginkgo.GinkgoWriter.Print("Original pull secret restored successfully\n")
	})

	// Build the corrupted docker config once; it does not change between retries.
	// NOTE(review): the e-mail values below appear obfuscated in the reviewed
	// copy of this file - confirm the intended literals.
	corruptedConfig := map[string]interface{}{
		"auths": map[string]interface{}{
			"cloud.openshift.com": map[string]interface{}{
				"auth":  "Y29ycnVwdGVkX3Rva2VuOmNvcnJ1cHRlZF9wYXNzd29yZA==",
				"email": "[email protected]",
			},
			"registry.connect.redhat.com": map[string]interface{}{
				"auth":  "Y29ycnVwdGVkX3Rva2VuOmNvcnJ1cHRlZF9wYXNzd29yZA==",
				"email": "[email protected]",
			},
		},
	}
	corruptedConfigBytes, err := json.Marshal(corruptedConfig)
	Expect(err).NotTo(HaveOccurred(), "Failed to corrupt pull secret")

	// Corrupt the pull secret to simulate the UpgradeConfigSyncFailure scenario.
	ginkgo.GinkgoWriter.Print("Corrupting pull secret to simulate sync failure...\n")
	err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
		pullSecret := &corev1.Secret{}
		if getErr := k8s.Get(ctx, pullSecretName, pullSecretNamespace, pullSecret); getErr != nil {
			return getErr
		}
		// Writing to a nil map panics, so make sure Data exists first.
		if pullSecret.Data == nil {
			pullSecret.Data = map[string][]byte{}
		}
		pullSecret.Data[".dockerconfigjson"] = corruptedConfigBytes
		return k8s.Update(ctx, pullSecret)
	})
	Expect(err).NotTo(HaveOccurred(), "Failed to corrupt pull secret")
	ginkgo.GinkgoWriter.Print("Pull secret corrupted successfully\n")

	// Trigger the UpgradeConfigSyncFailureOver4Hr alert.
	_, err = testPdClient.TriggerIncident("UpgradeConfigSyncFailureOver4HrSRE", clusterID)
	Expect(err).NotTo(HaveOccurred(), "Failed to trigger UpgradeConfigSyncFailureOver4Hr PagerDuty alert")

	// Wait for the investigation to process.
	ginkgo.GinkgoWriter.Print("Waiting for investigation to process corrupted pull secret...\n")
	time.Sleep(2 * time.Minute)

	// Get limited support reasons after corruption and report them.
	lsResponseAfter, err := utils.GetLimitedSupportReasons(ocme2eCli, clusterID)
	if err != nil {
		ginkgo.GinkgoWriter.Printf("Could not get limited support reasons after test: %v\n", err)
	} else {
		lsReasonsAfter := lsResponseAfter.Items().Len()

		fmt.Println("Limited Support Response After Pull Secret Corruption:")
		fmt.Printf("Total items: %d\n", lsReasonsAfter)

		for i, item := range lsResponseAfter.Items().Slice() {
			fmt.Printf("Reason #%d:\n", i+1)
			fmt.Printf(" - Summary: %s\n", item.Summary())
			fmt.Printf(" - Details: %s\n", item.Details())
		}

		// Only compare against the baseline when one was actually collected.
		switch {
		case !haveBaseline:
			ginkgo.GinkgoWriter.Printf("No baseline available; limited support reasons now at %d\n", lsReasonsAfter)
		case lsReasonsAfter > lsReasonsBefore:
			ginkgo.GinkgoWriter.Printf("Limited support reasons increased from %d to %d\n",
				lsReasonsBefore, lsReasonsAfter)
		default:
			ginkgo.GinkgoWriter.Printf("Limited support reasons remained at %d\n",
				lsReasonsAfter)
		}
	}

	fmt.Println("Test completed: UpgradeConfigSyncFailureOver4Hr investigation simulated successfully")
})
788+
675789
}, ginkgo.ContinueOnFailure)

test/e2e/utils/generate_incident.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ const (
1616
AlertInsightsOperatorDown = "InsightsOperatorDown"
1717
AlertMachineHealthCheckUnterminatedShortCircuitSRE = "MachineHealthCheckUnterminatedShortCircuitSRE"
1818
AlertApiErrorBudgetBurn = "ApiErrorBudgetBurn"
19+
UpgradeConfigSyncFailureOver4HrSRE = "UpgradeConfigSyncFailureOver4HrSRE"
1920
)
2021

2122
func GetAlertTitle(alertName string) (string, error) {
@@ -32,6 +33,8 @@ func GetAlertTitle(alertName string) (string, error) {
3233
return "MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)", nil
3334
case AlertApiErrorBudgetBurn:
3435
return "api-ErrorBudgetBurn k8sgpt test CRITICAL (1)", nil
36+
case UpgradeConfigSyncFailureOver4HrSRE:
37+
return "UpgradeConfigSyncFailureOver4HrSRE Critical (1)", nil
3538
default:
3639
return "", fmt.Errorf("unknown alert name: %s", alertName)
3740
}

test/e2e/utils/utils.go

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ package utils
66
import (
77
"fmt"
88

9+
sdk "github.com/openshift-online/ocm-sdk-go"
10+
amv1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1"
911
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
1012
servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
1114
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1215
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
1316
"k8s.io/client-go/tools/clientcmd"
@@ -16,24 +19,91 @@ import (
1619

1720
// GetLimitedSupportReasons lists the limited support reasons that OCM currently
// holds for the cluster identified by clusterID.
//
// It returns the raw list response, or a wrapped error when the request fails.
func GetLimitedSupportReasons(ocme2eCli *ocme2e.Client, clusterID string) (*cmv1.LimitedSupportReasonsListResponse, error) {
	lsResponse, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).LimitedSupportReasons().List().Send()
	if err != nil {
		// The request lists limited support reasons; say so instead of
		// blaming an unrelated service log call.
		return nil, fmt.Errorf("failed to list limited support reasons for cluster %s: %w", clusterID, err)
	}
	return lsResponse, nil
}
2527

26-
func GetServiceLogs(ocmCli ocm.Client, cluster *cmv1.Cluster) (*servicelogsv1.ClusterLogsUUIDListResponse, error) {
27-
filter := "log_type='cluster-state-updates'"
28-
clusterLogsUUIDListResponse, err := ocmCli.GetServiceLog(cluster, filter)
28+
func GetServiceLogs(ocmLike interface{}, cluster *cmv1.Cluster) (*servicelogsv1.ClusterLogsUUIDListResponse, error) {
29+
const filter = "log_type='cluster-state-updates'"
30+
31+
switch v := ocmLike.(type) {
32+
case ocm.Client:
33+
clusterLogsUUIDListResponse, err := v.GetServiceLog(cluster, filter)
34+
if err != nil {
35+
return nil, fmt.Errorf("Failed to get service log: %w", err)
36+
}
37+
return clusterLogsUUIDListResponse, nil
38+
case *ocme2e.Client:
39+
adapter := &e2eOCMAdapter{conn: v.Connection}
40+
clusterLogsUUIDListResponse, err := adapter.GetServiceLog(cluster, filter)
41+
if err != nil {
42+
return nil, fmt.Errorf("Failed to get service log (via adapter): %w", err)
43+
}
44+
return clusterLogsUUIDListResponse, nil
45+
default:
46+
return nil, fmt.Errorf("unsupported type for GetServiceLogs: %T", v)
47+
}
48+
}
49+
50+
type e2eOCMAdapter struct {
51+
conn *sdk.Connection
52+
}
53+
54+
func (a *e2eOCMAdapter) GetServiceLog(cluster *cmv1.Cluster, filter string) (*servicelogsv1.ClusterLogsUUIDListResponse, error) {
55+
if filter != "" {
56+
return a.conn.ServiceLogs().V1().Clusters().Cluster(cluster.ExternalID()).ClusterLogs().List().Search(filter).Send()
57+
}
58+
return a.conn.ServiceLogs().V1().Clusters().Cluster(cluster.ExternalID()).ClusterLogs().List().Send()
59+
}
60+
61+
// === IS USER BANNED (uses ocme2eCli.Connection) ===
62+
func IsUserBanned(ocme2eCli *ocme2e.Client, cluster *cmv1.Cluster) (bool, string, error) {
63+
conn := ocme2eCli.Connection
64+
user, err := getCreatorFromCluster(conn, cluster)
65+
if err != nil {
66+
return false, "encountered an issue when checking if the cluster owner is banned. Please investigate.", err
67+
}
68+
69+
if user.Banned() {
70+
noteMessage := fmt.Sprintf("User is banned %s. Ban description %s.\n Please open a proactive case, so that MCS can resolve the ban or organize an ownership transfer.", user.BanCode(), user.BanDescription())
71+
logging.Warnf(noteMessage)
72+
return true, noteMessage, nil
73+
}
74+
return false, "User is not banned.", nil
75+
}
76+
77+
func getCreatorFromCluster(conn *sdk.Connection, cluster *cmv1.Cluster) (*amv1.Account, error) {
78+
logging.Debugf("Getting subscription from cluster: %s", cluster.ID())
79+
sub, ok := cluster.GetSubscription()
80+
if !ok {
81+
return nil, fmt.Errorf("failed to get subscription from cluster: %s", cluster.ID())
82+
}
83+
subResp, err := conn.AccountsMgmt().V1().Subscriptions().Subscription(sub.ID()).Get().Send()
2984
if err != nil {
30-
return nil, fmt.Errorf("Failed to get service log: %w", err)
85+
return nil, err
86+
}
87+
subscription, ok := subResp.GetBody()
88+
if !ok {
89+
return nil, fmt.Errorf("failed to get subscription body")
90+
}
91+
if subscription.Status() != "Active" {
92+
return nil, fmt.Errorf("expecting status 'Active' found %v", subscription.Status())
93+
}
94+
accResp, err := conn.AccountsMgmt().V1().Accounts().Account(subscription.Creator().ID()).Get().Send()
95+
if err != nil {
96+
return nil, err
97+
}
98+
account, ok := accResp.GetBody()
99+
if !ok {
100+
return nil, fmt.Errorf("failed to get account body")
31101
}
32-
return clusterLogsUUIDListResponse, nil
102+
return account, nil
33103
}
34104

105+
// === CREATE CLIENT FROM KUBECONFIG ===
35106
func CreateClientFromKubeConfig(kubeConfigPath string) (pclient.Client, error) {
36-
// Load kubeconfig file and create a client
37107
cfg, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath)
38108
if err != nil {
39109
return nil, fmt.Errorf("failed to build kubeconfig: %v", err)

0 commit comments

Comments
 (0)