@@ -5,38 +5,54 @@ package osde2etests

import (
    "context"
+   "fmt"
    "os"
+   "time"

-   "github.com/aws/aws-sdk-go/aws"
-   "github.com/aws/aws-sdk-go/aws/credentials"
-   "github.com/aws/aws-sdk-go/aws/session"
+   "github.com/aws/aws-sdk-go-v2/config"
+   "github.com/aws/aws-sdk-go-v2/credentials"
+   "github.com/aws/aws-sdk-go-v2/service/ec2"
    "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
+   awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
+   "github.com/openshift/configuration-anomaly-detection/pkg/ocm"
    ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
    "github.com/openshift/osde2e-common/pkg/clients/openshift"
+   appsv1 "k8s.io/api/apps/v1"
+   "k8s.io/client-go/util/retry"
    logger "sigs.k8s.io/controller-runtime/pkg/log"
)

var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
    var (
-       ocmCli   *ocme2e.Client
-       k8s      *openshift.Client
-       region   string
-       provider string
+       ocme2eCli *ocme2e.Client
+       ocmCli    ocm.Client
+       k8s       *openshift.Client
+       region    string
+       provider  string
+       clusterID string
    )

-   ginkgo.BeforeAll(func(ctx context.Context) {
+   BeforeAll(func(ctx context.Context) {
        logger.SetLogger(ginkgo.GinkgoLogr)
-
        var err error
-
        ocmEnv := ocme2e.Stage
        ocmToken := os.Getenv("OCM_TOKEN")
        clientID := os.Getenv("CLIENT_ID")
        clientSecret := os.Getenv("CLIENT_SECRET")
-       ocmCli, err = ocme2e.New(ctx, ocmToken, clientID, clientSecret, ocmEnv)
-       Expect(err).ShouldNot(HaveOccurred(), "Unable to setup ocm client")
+       clusterID = os.Getenv("CLUSTER_ID")
+       cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")
+
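+       // Fail fast when any of the required environment variables is missing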
+       Expect(ocmToken).NotTo(BeEmpty(), "OCM_TOKEN must be set")
+       Expect(clusterID).NotTo(BeEmpty(), "CLUSTER_ID must be set")
+       Expect(cadOcmFilePath).NotTo(BeEmpty(), "CAD_OCM_FILE_PATH must be set")
+
+       ocme2eCli, err = ocme2e.New(ctx, ocmToken, clientID, clientSecret, ocmEnv)
+       Expect(err).ShouldNot(HaveOccurred(), "Unable to setup E2E OCM Client")
+
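+       // CAD's own OCM client, built from the config file referenced by CAD_OCM_FILE_PATH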
+       ocmCli, err = ocm.New(cadOcmFilePath)
+       Expect(err).ShouldNot(HaveOccurred(), "Unable to setup ocm anomaly detection client")

        k8s, err = openshift.New(ginkgo.GinkgoLogr)
        Expect(err).ShouldNot(HaveOccurred(), "Unable to setup k8s client")
@@ -48,24 +64,206 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
        Expect(err).NotTo(HaveOccurred(), "Could not determine provider")
    })

-   ginkgo.It("can fetch service logs", func(ctx context.Context) {
+   AfterAll(func() {
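+       // Release the OCM connection opened in BeforeAll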
+       if ocme2eCli != nil && ocme2eCli.Connection != nil {
+           ocme2eCli.Connection.Close()
+       }
+   })
+
+   It("AWS CCS: cluster has gone missing (blocked egress)", Label("aws", "ccs", "chgm", "limited-support", "blocking-egress"), func(ctx context.Context) {
        if provider == "aws" {
            awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID")
            awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY")
-           Expect(awsAccessKey).NotTo(BeEmpty(), "awsAccessKey not found")
-           Expect(awsSecretKey).NotTo(BeEmpty(), "awsSecretKey not found")
-
-           _, err := session.NewSession(aws.NewConfig().WithCredentials(credentials.NewStaticCredentials(awsAccessKey, awsSecretKey, "")).WithRegion(region))
-           Expect(err).NotTo(HaveOccurred(), "Could not set up aws session")
-           Expect(err).NotTo(HaveOccurred(), "Unable to get service logs for cluster")
-
-           // TODO(SDE-4821): Add the following tests
-           // AWS CCS: cluster has gone missing (no known misconfiguration)
-           // AWS CCS: cluster has gone missing (blocked egress)
-           // AWS CCS: cluster has gone missing (infra nodes turned off)
-           // AWS CCS: monitoring error budget burn (misconfigured UWM configmap)
-           // AWS CCS: monitoring errror budget burn (no known misconfiguration
-           ocmCli.Connection.Close()
+           Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found")
+           Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found")
+
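+           // AWS SDK v2 config with static credentials, scoped to the cluster's region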
+           awsCfg, err := config.LoadDefaultConfig(ctx,
+               config.WithRegion(region),
+               config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
+                   awsAccessKey,
+                   awsSecretKey,
+                   "",
+               )),
+           )
+           Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config")
+
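+           // NewEC2ClientWrapper, GetLimitedSupportReasons, BlockEgress and RestoreEgress
+           // are helpers defined elsewhere in this test package (not shown in this diff)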
+           ec2Client := ec2.NewFromConfig(awsCfg)
+           ec2Wrapper := NewEC2ClientWrapper(ec2Client)
+
+           awsCli, err := awsinternal.NewClient(awsAccessKey, awsSecretKey, "", region)
+           Expect(err).NotTo(HaveOccurred(), "Failed to create AWS client")
+
+           clusterResource, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
+           Expect(err).NotTo(HaveOccurred(), "Failed to fetch cluster from OCM")
+
+           cluster := clusterResource.Body()
+           infraID := cluster.InfraID()
+           Expect(infraID).NotTo(BeEmpty(), "InfraID missing from cluster")
+
+           sgID, err := awsCli.GetSecurityGroupID(infraID)
+           Expect(err).NotTo(HaveOccurred(), "Failed to get security group ID")
+
+           // Get limited support reasons before blocking egress
+           lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
+           Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
+           lsReasonsBefore := lsResponseBefore.Items().Len()
+
+           ginkgo.GinkgoWriter.Printf("Limited support reasons before blocking egress: %d\n", lsReasonsBefore)
+           ginkgo.GinkgoWriter.Printf("Blocking egress for security group: %s\n", sgID)
+
+           // Block egress
+           Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress")
+           ginkgo.GinkgoWriter.Printf("Egress blocked\n")
+
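+           // Give CAD time to detect the unreachable cluster before re-reading limited support reasons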
+           time.Sleep(20 * time.Minute)
+
+           lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
+           Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
+
+           // Print the response data
+           fmt.Println("Limited Support Response After Blocking Egress:")
+           fmt.Printf("Total items: %d\n", lsResponseAfter.Items().Len())
+
+           // Iterate through each item and print details
+           items := lsResponseAfter.Items().Slice()
+           for i, item := range items {
+               fmt.Printf("Reason #%d:\n", i+1)
+               fmt.Printf(" - Summary: %s\n", item.Summary())
+               fmt.Printf(" - Details: %s\n", item.Details())
+           }
+
+           // Restore egress
+           Expect(RestoreEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to restore egress")
+           ginkgo.GinkgoWriter.Printf("Egress restored\n")
+       }
+   })
+
+   It("AWS CCS: cluster has gone missing (no known misconfiguration)", func(ctx context.Context) {
+       if provider == "aws" {
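+           // Scale the monitoring stack down and back up; with no real misconfiguration,
+           // the service log and limited support counts are expected to be unchanged
+           // (asserted at the end of the test)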
+           // Get cluster information from OCM
+           response, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
+           Expect(err).ToNot(HaveOccurred(), "failed to get cluster from OCM")
+           cluster := response.Body()
+           Expect(cluster).ToNot(BeNil(), "received nil cluster from OCM")
+
+           // Get service logs
+           logs, err := GetServiceLogs(ocmCli, cluster)
+           Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
+           logsBefore := logs.Items().Slice()
+
+           lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
+           Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
+           lsReasonsBefore := lsResponseBefore.Items().Len()
+
+           var zero int32 = 0
+
+           // Step 1: Scale down cluster-monitoring-operator with retry
+           fmt.Println("Step 1: Scaling down cluster-monitoring-operator")
+           var originalCMOReplicas int32
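+           // retry.RetryOnConflict re-runs the get/modify/update closure if the API server rejects the write with a conflict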
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               cmo := &appsv1.Deployment{}
+               err := k8s.Get(ctx, "cluster-monitoring-operator", "openshift-monitoring", cmo)
+               if err != nil {
+                   return err
+               }
+               originalCMOReplicas = *cmo.Spec.Replicas
+               cmo.Spec.Replicas = &zero
+               return k8s.Update(ctx, cmo)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale down cluster-monitoring-operator")
+           fmt.Printf("Scaled down cluster-monitoring-operator from %d to 0 replicas\n", originalCMOReplicas)
+
+           // Step 2: Scale down prometheus-operator with retry
+           fmt.Println("Step 2: Scaling down prometheus-operator")
+           var originalPOReplicas int32
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               po := &appsv1.Deployment{}
+               err := k8s.Get(ctx, "prometheus-operator", "openshift-monitoring", po)
+               if err != nil {
+                   return err
+               }
+               originalPOReplicas = *po.Spec.Replicas
+               po.Spec.Replicas = &zero
+               return k8s.Update(ctx, po)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale down prometheus-operator")
+           fmt.Printf("Scaled down prometheus-operator from %d to 0 replicas\n", originalPOReplicas)
+
+           // Step 3: Scale down alertmanager-main with retry
+           fmt.Println("Step 3: Scaling down alertmanager-main")
+           var originalAMReplicas int32
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               sts := &appsv1.StatefulSet{}
+               err := k8s.Get(ctx, "alertmanager-main", "openshift-monitoring", sts)
+               if err != nil {
+                   return err
+               }
+               originalAMReplicas = *sts.Spec.Replicas
+               sts.Spec.Replicas = &zero
+               return k8s.Update(ctx, sts)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager")
+           fmt.Printf("Alertmanager scaled down from %d to 0 replicas. Waiting...\n", originalAMReplicas)
+
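+           // Give CAD time to react (or not) to the silenced monitoring stack before re-checking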
+           time.Sleep(20 * time.Minute)
+
+           logs, err = GetServiceLogs(ocmCli, cluster)
+           Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
+           logsAfter := logs.Items().Slice()
+
+           lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
+           Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
+           lsReasonsAfter := lsResponseAfter.Items().Len()
+
+           // Step 5: Scale alertmanager-main back up with retry
+           fmt.Println("Step 5: Scaling alertmanager-main back up")
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               sts := &appsv1.StatefulSet{}
+               err := k8s.Get(ctx, "alertmanager-main", "openshift-monitoring", sts)
+               if err != nil {
+                   return err
+               }
+               replicas := originalAMReplicas
+               sts.Spec.Replicas = &replicas
+               return k8s.Update(ctx, sts)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale up alertmanager")
+           fmt.Printf("Alertmanager scaled back up to %d replicas\n", originalAMReplicas)
+
+           // Step 6: Scale prometheus-operator back up with retry
+           fmt.Println("Step 6: Scaling prometheus-operator back up")
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               po := &appsv1.Deployment{}
+               err := k8s.Get(ctx, "prometheus-operator", "openshift-monitoring", po)
+               if err != nil {
+                   return err
+               }
+               replicas := originalPOReplicas
+               po.Spec.Replicas = &replicas
+               return k8s.Update(ctx, po)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale up prometheus-operator")
+           fmt.Printf("Prometheus-operator scaled back up to %d replicas\n", originalPOReplicas)
+
+           // Step 7: Scale cluster-monitoring-operator back up with retry
+           fmt.Println("Step 7: Scaling cluster-monitoring-operator back up")
+           err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
+               cmo := &appsv1.Deployment{}
+               err := k8s.Get(ctx, "cluster-monitoring-operator", "openshift-monitoring", cmo)
+               if err != nil {
+                   return err
+               }
+               replicas := originalCMOReplicas
+               cmo.Spec.Replicas = &replicas
+               return k8s.Update(ctx, cmo)
+           })
+           Expect(err).ToNot(HaveOccurred(), "failed to scale up cluster-monitoring-operator")
+           fmt.Printf("Cluster-monitoring-operator scaled back up to %d replicas\n", originalCMOReplicas)
+
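+           // The counts should be unchanged: CAD should not have flagged the cluster during the scale-down window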
+           Expect(logsAfter).To(HaveLen(len(logsBefore)), "Service logs count changed after scale down/up")
+           Expect(lsReasonsAfter).To(Equal(lsReasonsBefore), "Limited support reasons changed after scale down/up")
+
+           fmt.Println("Test completed: All components restored to original replica counts.")
        }
    })
})