@@ -5,6 +5,7 @@ package osde2etests
5
5
6
6
import (
7
7
"context"
8
+ "encoding/json"
8
9
"fmt"
9
10
"log"
10
11
"os"
@@ -20,7 +21,6 @@ import (
20
21
v1beta1 "github.com/openshift/api/machine/v1beta1"
21
22
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
22
23
machineutil "github.com/openshift/configuration-anomaly-detection/pkg/investigations/utils/machine"
23
- "github.com/openshift/configuration-anomaly-detection/pkg/ocm"
24
24
"github.com/openshift/configuration-anomaly-detection/test/e2e/utils"
25
25
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
26
26
"github.com/openshift/osde2e-common/pkg/clients/openshift"
@@ -36,7 +36,6 @@ import (
36
36
var _ = Describe ("Configuration Anomaly Detection" , Ordered , func () {
37
37
var (
38
38
ocme2eCli * ocme2e.Client
39
- ocmCli ocm.Client
40
39
k8s * openshift.Client
41
40
region string
42
41
provider string
@@ -52,18 +51,13 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
52
51
clientID := os .Getenv ("CLIENT_ID" )
53
52
clientSecret := os .Getenv ("CLIENT_SECRET" )
54
53
clusterID = os .Getenv ("OCM_CLUSTER_ID" )
55
- cadOcmFilePath := os .Getenv ("CAD_OCM_FILE_PATH" )
56
54
57
55
Expect (ocmToken ).NotTo (BeEmpty (), "OCM_TOKEN must be set" )
58
56
Expect (clusterID ).NotTo (BeEmpty (), "CLUSTER_ID must be set" )
59
- Expect (cadOcmFilePath ).NotTo (BeEmpty (), "CAD_OCM_FILE_PATH must be set" )
60
57
61
58
ocme2eCli , err = ocme2e .New (ctx , ocmToken , clientID , clientSecret , ocmEnv )
62
59
Expect (err ).ShouldNot (HaveOccurred (), "Unable to setup E2E OCM Client" )
63
60
64
- ocmCli , err = ocm .New (cadOcmFilePath )
65
- Expect (err ).ShouldNot (HaveOccurred (), "Unable to setup ocm anomaly detection client" )
66
-
67
61
k8s , err = openshift .New (ginkgo .GinkgoLogr )
68
62
Expect (err ).ShouldNot (HaveOccurred (), "Unable to setup k8s client" )
69
63
@@ -165,7 +159,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
165
159
Expect (cluster ).ToNot (BeNil (), "received nil cluster from OCM" )
166
160
167
161
// Get service logs
168
- logs , err := utils .GetServiceLogs (ocmCli , cluster )
162
+ logs , err := utils .GetServiceLogs (ocme2eCli , cluster )
169
163
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs" )
170
164
logsBefore := logs .Items ().Slice ()
171
165
@@ -228,7 +222,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
228
222
229
223
time .Sleep (1 * time .Minute )
230
224
231
- logs , err = utils .GetServiceLogs (ocmCli , cluster )
225
+ logs , err = utils .GetServiceLogs (ocme2eCli , cluster )
232
226
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs" )
233
227
logsAfter := logs .Items ().Slice ()
234
228
@@ -505,8 +499,8 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
505
499
ginkgo .GinkgoWriter .Println ("Step 7: Test completed: Node NotReady condition simulated and checked." )
506
500
}
507
501
})
508
-
509
- It ("AWS CCS: clustermonitoringerrorbudgetburn" , func (ctx context.Context ) {
502
+
503
+ It ("AWS CCS: clustermonitoringerrorbudgetburn" , func (ctx context.Context ) {
510
504
if provider == "aws" {
511
505
const (
512
506
namespace = "openshift-user-workload-monitoring"
@@ -520,7 +514,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
520
514
Expect (cluster ).ToNot (BeNil (), "Cluster response is nil" )
521
515
522
516
fmt .Println ("Step 1: Getting service logs before misconfiguration" )
523
- logs , err := utils .GetServiceLogs (ocmCli , cluster )
517
+ logs , err := utils .GetServiceLogs (ocme2eCli , cluster )
524
518
Expect (err ).ToNot (HaveOccurred (), "Failed to fetch service logs before misconfig" )
525
519
logsBefore := logs .Items ().Slice ()
526
520
@@ -574,15 +568,15 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
574
568
time .Sleep (2 * time .Minute )
575
569
576
570
fmt .Println ("Step 5: Fetching service logs after misconfiguration" )
577
- logs , err = utils .GetServiceLogs (ocmCli , cluster )
571
+ logs , err = utils .GetServiceLogs (ocme2eCli , cluster )
578
572
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs" )
579
573
logsAfter := logs .Items ().Slice ()
580
574
581
575
Expect (logsAfter ).To (HaveLen (len (logsBefore )), "Service logs count changed after scale down/up" )
582
- }
583
- })
584
-
585
- It ("AWS CCS: InsightsOperatorDown (blocked egress)" , Label ("aws" , "ccs" , "insights-operator" , "blocking-egress" ), func (ctx context.Context ) {
576
+ }
577
+ })
578
+
579
+ It ("AWS CCS: InsightsOperatorDown (blocked egress)" , Label ("aws" , "ccs" , "insights-operator" , "blocking-egress" ), func (ctx context.Context ) {
586
580
if provider == "aws" {
587
581
awsAccessKey := os .Getenv ("AWS_ACCESS_KEY_ID" )
588
582
awsSecretKey := os .Getenv ("AWS_SECRET_ACCESS_KEY" )
@@ -616,7 +610,7 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
616
610
Expect (err ).NotTo (HaveOccurred (), "Failed to get security group ID" )
617
611
618
612
// Step 1: Get logs before action
619
- logsBefore , err := utils .GetServiceLogs (ocmCli , cluster )
613
+ logsBefore , err := utils .GetServiceLogs (ocme2eCli , cluster )
620
614
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs before action" )
621
615
622
616
existingLogIDs := map [string ]bool {}
@@ -656,8 +650,10 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
656
650
_ , err = testPdClient .TriggerIncident ("InsightsOperatorDown" , clusterID )
657
651
Expect (err ).NotTo (HaveOccurred (), "Failed to trigger silent PagerDuty alert" )
658
652
653
+ time .Sleep (2 * time .Minute )
654
+
659
655
// Step 4: Get logs again and find new entries
660
- logsAfter , err := utils .GetServiceLogs (ocmCli , cluster )
656
+ logsAfter , err := utils .GetServiceLogs (ocme2eCli , cluster )
661
657
Expect (err ).ToNot (HaveOccurred (), "Failed to get service logs after action" )
662
658
663
659
newLogs := []interface {}{}
@@ -672,4 +668,122 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
672
668
}
673
669
})
674
670
671
+ It ("UpgradeConfigSyncFailureOver4Hr: corrupted pull secret investigation" , Label ("pull-secret" , "upgrade-config-sync" , "user-banned-check" ), func (ctx context.Context ) {
672
+ // Get cluster information from OCM
673
+ response , err := ocme2eCli .ClustersMgmt ().V1 ().Clusters ().Cluster (clusterID ).Get ().Send ()
674
+ Expect (err ).ToNot (HaveOccurred (), "failed to get cluster from OCM" )
675
+ cluster := response .Body ()
676
+ Expect (cluster ).ToNot (BeNil (), "received nil cluster from OCM" )
677
+
678
+ lsResponseBefore , err := utils .GetLimitedSupportReasons (ocme2eCli , clusterID )
679
+ var lsReasonsBefore int
680
+ if err != nil {
681
+ ginkgo .GinkgoWriter .Printf ("Could not get limited support reasons before test: %v\n " , err )
682
+ lsReasonsBefore = 0
683
+ } else {
684
+ lsReasonsBefore = lsResponseBefore .Items ().Len ()
685
+ ginkgo .GinkgoWriter .Printf ("Limited support reasons before pull secret corruption %d\n " , lsReasonsBefore )
686
+ }
687
+
688
+ // Get the original pull secret for backup
689
+ var originalPullSecret corev1.Secret
690
+ err = k8s .Get (ctx , "pull-secret" , "openshift-config" , & originalPullSecret )
691
+ Expect (err ).NotTo (HaveOccurred (), "Failed to get original pull secret" )
692
+ ginkgo .GinkgoWriter .Print ("Original pull secret retrieved successfully\n " )
693
+
694
+ // Setup deferred restoration to ensure pull secret is restored regardless of test outcome
695
+ defer func () {
696
+ ginkgo .GinkgoWriter .Print ("Restoring original pull secret...\n " )
697
+ err := retry .RetryOnConflict (retry .DefaultRetry , func () error {
698
+ currentSecret := & corev1.Secret {}
699
+ err := k8s .Get (ctx , "pull-secret" , "openshift-config" , currentSecret )
700
+ if err != nil {
701
+ return err
702
+ }
703
+ // Restore original data
704
+ currentSecret .Data = originalPullSecret .Data
705
+ return k8s .Update (ctx , currentSecret )
706
+ })
707
+ if err != nil {
708
+ ginkgo .GinkgoWriter .Print ("Failed to restore original pull secret: %v\n " , err )
709
+ } else {
710
+ ginkgo .GinkgoWriter .Print ("Original pull secret restored successfully\n " )
711
+ }
712
+ }()
713
+
714
+ // Corrupt the pull secret to simulate the UpgradeConfigSyncFailure scenario
715
+ ginkgo .GinkgoWriter .Print ("Corrupting pull secret to simulate sync failure...\n " )
716
+ err = retry .RetryOnConflict (retry .DefaultRetry , func () error {
717
+ pullSecret := & corev1.Secret {}
718
+ err := k8s .Get (ctx , "pull-secret" , "openshift-config" , pullSecret )
719
+ if err != nil {
720
+ return err
721
+ }
722
+
723
+ // Create a corrupted docker config json
724
+ corruptedConfig := map [string ]interface {}{
725
+ "auths" : map [string ]interface {}{
726
+ "cloud.openshift.com" : map [string ]interface {}{
727
+ "auth" : "Y29ycnVwdGVkX3Rva2VuOmNvcnJ1cHRlZF9wYXNzd29yZA==" ,
728
+
729
+ },
730
+ "registry.connect.redhat.com" : map [string ]interface {}{
731
+ "auth" : "Y29ycnVwdGVkX3Rva2VuOmNvcnJ1cHRlZF9wYXNzd29yZA==" ,
732
+
733
+ },
734
+ },
735
+ }
736
+
737
+ corruptedConfigBytes , err := json .Marshal (corruptedConfig )
738
+ if err != nil {
739
+ return err
740
+ }
741
+
742
+ // Update the pull secret with corrupted data
743
+ pullSecret .Data [".dockerconfigjson" ] = corruptedConfigBytes
744
+ return k8s .Update (ctx , pullSecret )
745
+ })
746
+ Expect (err ).NotTo (HaveOccurred (), "Failed to corrupt pull secret" )
747
+ ginkgo .GinkgoWriter .Print ("Pull secret corrupted successfully\n " )
748
+
749
+ // Trigger the UpgradeConfigSyncFailureOver4Hr alert
750
+ _ , err = testPdClient .TriggerIncident ("UpgradeConfigSyncFailureOver4HrSRE" , clusterID )
751
+ Expect (err ).NotTo (HaveOccurred (), "Failed to trigger UpgradeConfigSyncFailureOver4Hr PagerDuty alert" )
752
+
753
+ // Wait for the investigation to process
754
+ ginkgo .GinkgoWriter .Print ("Waiting for investigation to process corrupted pull secret...\n " )
755
+ time .Sleep (2 * time .Minute )
756
+
757
+ // Get limited support reasons after corruption
758
+ lsResponseAfter , err := utils .GetLimitedSupportReasons (ocme2eCli , clusterID )
759
+ if err != nil {
760
+ ginkgo .GinkgoWriter .Printf ("Could not get limited support reasons after test: %v\n " , err )
761
+ } else {
762
+ // Print the response data
763
+ fmt .Println ("Limited Support Response After Pull Secret Corruption:" )
764
+ fmt .Printf ("Total items: %d\n " , lsResponseAfter .Items ().Len ())
765
+
766
+ // Iterate through each item and print details
767
+ items := lsResponseAfter .Items ().Slice ()
768
+ for i , item := range items {
769
+ fmt .Printf ("Reason #%d:\n " , i + 1 )
770
+ fmt .Printf (" - Summary: %s\n " , item .Summary ())
771
+ fmt .Printf (" - Details: %s\n " , item .Details ())
772
+ }
773
+
774
+ // Compare with before if we had baseline data
775
+ if lsReasonsBefore >= 0 {
776
+ if lsResponseAfter .Items ().Len () > lsReasonsBefore {
777
+ ginkgo .GinkgoWriter .Printf ("Limited support reasons increased from %d to %d\n " ,
778
+ lsReasonsBefore , lsResponseAfter .Items ().Len ())
779
+ } else {
780
+ ginkgo .GinkgoWriter .Printf ("Limited support reasons remained at %d\n " ,
781
+ lsResponseAfter .Items ().Len ())
782
+ }
783
+ }
784
+ }
785
+
786
+ fmt .Println ("Test completed: UpgradeConfigSyncFailureOver4Hr investigation simulated successfully" )
787
+ })
788
+
675
789
}, ginkgo .ContinueOnFailure )
0 commit comments