Skip to content

Commit 88be4a3

Browse files
authored
OSD-29468: Configuration Anomaly Detection - E2E Test - Blocked Egress / No Known Misconfiguration (#431)
* Initial commit * Blocked egress commit * Test cases for CHGM - No known misconfiguration * Test cases for CHGM - No known misconfiguration * Fixed lint issue * Fixing a few last set of issues * Made enhancements as per the comments * Made changes as per the comments * Final changes
1 parent 19709d6 commit 88be4a3

File tree

4 files changed

+336
-28
lines changed

4 files changed

+336
-28
lines changed

pkg/aws/aws.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
configv2 "github.com/aws/aws-sdk-go-v2/config"
1616
credentialsv2 "github.com/aws/aws-sdk-go-v2/credentials"
1717
cloudtrailv2 "github.com/aws/aws-sdk-go-v2/service/cloudtrail"
18-
1918
cloudtrailv2types "github.com/aws/aws-sdk-go-v2/service/cloudtrail/types"
2019
ec2v2 "github.com/aws/aws-sdk-go-v2/service/ec2"
2120
ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"

test/e2e/aws.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package osde2etests
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/aws/aws-sdk-go-v2/service/ec2"
8+
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
9+
)
10+
11+
// EC2API interface to make testing easier
12+
type EC2API interface {
13+
RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error)
14+
AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error)
15+
}
16+
17+
// EC2ClientWrapper wraps the AWS SDK EC2 client to implement our EC2API interface
18+
type EC2ClientWrapper struct {
19+
Client *ec2.Client
20+
}
21+
22+
// RevokeSecurityGroupEgress implements EC2API
23+
func (w *EC2ClientWrapper) RevokeSecurityGroupEgress(ctx context.Context, params *ec2.RevokeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupEgressOutput, error) {
24+
return w.Client.RevokeSecurityGroupEgress(ctx, params, optFns...)
25+
}
26+
27+
// AuthorizeSecurityGroupEgress implements EC2API
28+
func (w *EC2ClientWrapper) AuthorizeSecurityGroupEgress(ctx context.Context, params *ec2.AuthorizeSecurityGroupEgressInput, optFns ...func(*ec2.Options)) (*ec2.AuthorizeSecurityGroupEgressOutput, error) {
29+
return w.Client.AuthorizeSecurityGroupEgress(ctx, params, optFns...)
30+
}
31+
32+
// NewEC2ClientWrapper creates a new EC2ClientWrapper that implements EC2API
33+
func NewEC2ClientWrapper(client *ec2.Client) *EC2ClientWrapper {
34+
return &EC2ClientWrapper{Client: client}
35+
}
36+
37+
// BlockEgress revokes all outbound traffic from the security group
38+
func BlockEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
39+
input := &ec2.RevokeSecurityGroupEgressInput{
40+
GroupId: &securityGroupID,
41+
IpPermissions: []types.IpPermission{
42+
{
43+
IpProtocol: awsString("-1"), // -1 = all protocols
44+
IpRanges: []types.IpRange{
45+
{CidrIp: awsString("0.0.0.0/0")},
46+
},
47+
},
48+
},
49+
}
50+
_, err := ec2Client.RevokeSecurityGroupEgress(ctx, input)
51+
if err != nil {
52+
return fmt.Errorf("failed to revoke egress: %w", err)
53+
}
54+
return nil
55+
}
56+
57+
// RestoreEgress allows all outbound traffic from the security group
58+
func RestoreEgress(ctx context.Context, ec2Client EC2API, securityGroupID string) error {
59+
input := &ec2.AuthorizeSecurityGroupEgressInput{
60+
GroupId: &securityGroupID,
61+
IpPermissions: []types.IpPermission{
62+
{
63+
IpProtocol: awsString("-1"),
64+
IpRanges: []types.IpRange{
65+
{CidrIp: awsString("0.0.0.0/0")},
66+
},
67+
},
68+
},
69+
}
70+
_, err := ec2Client.AuthorizeSecurityGroupEgress(ctx, input)
71+
if err != nil {
72+
return fmt.Errorf("failed to restore egress: %w", err)
73+
}
74+
return nil
75+
}
76+
77+
// awsString returns a pointer to a fresh copy of value. The AWS SDK input
// structs built in this file take *string fields, and Go cannot take the
// address of a string literal directly.
func awsString(value string) *string {
	ptr := new(string)
	*ptr = value
	return ptr
}

test/e2e/configuration_anomaly_detection_test.go

Lines changed: 225 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,38 +5,54 @@ package osde2etests
55

66
import (
77
"context"
8+
"fmt"
89
"os"
10+
"time"
911

10-
"github.com/aws/aws-sdk-go/aws"
11-
"github.com/aws/aws-sdk-go/aws/credentials"
12-
"github.com/aws/aws-sdk-go/aws/session"
12+
"github.com/aws/aws-sdk-go-v2/config"
13+
"github.com/aws/aws-sdk-go-v2/credentials"
14+
"github.com/aws/aws-sdk-go-v2/service/ec2"
1315
"github.com/onsi/ginkgo/v2"
1416
. "github.com/onsi/ginkgo/v2"
1517
. "github.com/onsi/gomega"
18+
awsinternal "github.com/openshift/configuration-anomaly-detection/pkg/aws"
19+
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
1620
ocme2e "github.com/openshift/osde2e-common/pkg/clients/ocm"
1721
"github.com/openshift/osde2e-common/pkg/clients/openshift"
22+
appsv1 "k8s.io/api/apps/v1"
23+
"k8s.io/client-go/util/retry"
1824
logger "sigs.k8s.io/controller-runtime/pkg/log"
1925
)
2026

2127
var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
2228
var (
23-
ocmCli *ocme2e.Client
24-
k8s *openshift.Client
25-
region string
26-
provider string
29+
ocme2eCli *ocme2e.Client
30+
ocmCli ocm.Client
31+
k8s *openshift.Client
32+
region string
33+
provider string
34+
clusterID string
2735
)
2836

29-
ginkgo.BeforeAll(func(ctx context.Context) {
37+
BeforeAll(func(ctx context.Context) {
3038
logger.SetLogger(ginkgo.GinkgoLogr)
31-
3239
var err error
33-
3440
ocmEnv := ocme2e.Stage
3541
ocmToken := os.Getenv("OCM_TOKEN")
3642
clientID := os.Getenv("CLIENT_ID")
3743
clientSecret := os.Getenv("CLIENT_SECRET")
38-
ocmCli, err = ocme2e.New(ctx, ocmToken, clientID, clientSecret, ocmEnv)
39-
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup ocm client")
44+
clusterID = os.Getenv("CLUSTER_ID")
45+
cadOcmFilePath := os.Getenv("CAD_OCM_FILE_PATH")
46+
47+
Expect(ocmToken).NotTo(BeEmpty(), "OCM_TOKEN must be set")
48+
Expect(clusterID).NotTo(BeEmpty(), "CLUSTER_ID must be set")
49+
Expect(cadOcmFilePath).NotTo(BeEmpty(), "CAD_OCM_FILE_PATH must be set")
50+
51+
ocme2eCli, err = ocme2e.New(ctx, ocmToken, clientID, clientSecret, ocmEnv)
52+
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup E2E OCM Client")
53+
54+
ocmCli, err = ocm.New(cadOcmFilePath)
55+
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup ocm anomaly detection client")
4056

4157
k8s, err = openshift.New(ginkgo.GinkgoLogr)
4258
Expect(err).ShouldNot(HaveOccurred(), "Unable to setup k8s client")
@@ -48,24 +64,206 @@ var _ = Describe("Configuration Anomaly Detection", Ordered, func() {
4864
Expect(err).NotTo(HaveOccurred(), "Could not determine provider")
4965
})
5066

51-
ginkgo.It("can fetch service logs", func(ctx context.Context) {
67+
// Close the E2E OCM connection once every spec in this container has run.
// Nothing is done when the client or its connection was never created.
AfterAll(func() {
	if ocme2eCli == nil || ocme2eCli.Connection == nil {
		return
	}
	ocme2eCli.Connection.Close()
})
72+
73+
It("AWS CCS: cluster has gone missing (blocked egress)", Label("aws", "ccs", "chgm", "limited-support", "blocking-egress"), func(ctx context.Context) {
5274
if provider == "aws" {
5375
awsAccessKey := os.Getenv("AWS_ACCESS_KEY_ID")
5476
awsSecretKey := os.Getenv("AWS_SECRET_ACCESS_KEY")
55-
Expect(awsAccessKey).NotTo(BeEmpty(), "awsAccessKey not found")
56-
Expect(awsSecretKey).NotTo(BeEmpty(), "awsSecretKey not found")
57-
58-
_, err := session.NewSession(aws.NewConfig().WithCredentials(credentials.NewStaticCredentials(awsAccessKey, awsSecretKey, "")).WithRegion(region))
59-
Expect(err).NotTo(HaveOccurred(), "Could not set up aws session")
60-
Expect(err).NotTo(HaveOccurred(), "Unable to get service logs for cluster")
61-
62-
// TODO(SDE-4821): Add the following tests
63-
// AWS CCS: cluster has gone missing (no known misconfiguration)
64-
// AWS CCS: cluster has gone missing (blocked egress)
65-
// AWS CCS: cluster has gone missing (infra nodes turned off)
66-
// AWS CCS: monitoring error budget burn (misconfigured UWM configmap)
67-
// AWS CCS: monitoring errror budget burn (no known misconfiguration
68-
ocmCli.Connection.Close()
77+
Expect(awsAccessKey).NotTo(BeEmpty(), "AWS access key not found")
78+
Expect(awsSecretKey).NotTo(BeEmpty(), "AWS secret key not found")
79+
80+
awsCfg, err := config.LoadDefaultConfig(ctx,
81+
config.WithRegion(region),
82+
config.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
83+
awsAccessKey,
84+
awsSecretKey,
85+
"",
86+
)),
87+
)
88+
Expect(err).NotTo(HaveOccurred(), "Failed to create AWS config")
89+
90+
ec2Client := ec2.NewFromConfig(awsCfg)
91+
ec2Wrapper := NewEC2ClientWrapper(ec2Client)
92+
93+
awsCli, err := awsinternal.NewClient(awsAccessKey, awsSecretKey, "", region)
94+
Expect(err).NotTo(HaveOccurred(), "Failed to create AWS client")
95+
96+
clusterResource, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
97+
Expect(err).NotTo(HaveOccurred(), "Failed to fetch cluster from OCM")
98+
99+
cluster := clusterResource.Body()
100+
infraID := cluster.InfraID()
101+
Expect(infraID).NotTo(BeEmpty(), "InfraID missing from cluster")
102+
103+
sgID, err := awsCli.GetSecurityGroupID(infraID)
104+
Expect(err).NotTo(HaveOccurred(), "Failed to get security group ID")
105+
106+
// Get limited support reasons before blocking egress
107+
lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
108+
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
109+
lsReasonsBefore := lsResponseBefore.Items().Len()
110+
111+
ginkgo.GinkgoWriter.Printf("Limited support reasons before blocking egress: %d\n", lsReasonsBefore)
112+
ginkgo.GinkgoWriter.Printf("Blocking egress for security group: %s\n", sgID)
113+
114+
// Block egress
115+
Expect(BlockEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to block egress")
116+
ginkgo.GinkgoWriter.Printf("Egress blocked\n")
117+
118+
time.Sleep(20 * time.Minute)
119+
120+
lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
121+
Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
122+
123+
// Print the response data
124+
fmt.Println("Limited Support Response After Blocking Egress:")
125+
fmt.Printf("Total items: %d\n", lsResponseAfter.Items().Len())
126+
127+
// Iterate through each item and print details
128+
items := lsResponseAfter.Items().Slice()
129+
for i, item := range items {
130+
fmt.Printf("Reason #%d:\n", i+1)
131+
fmt.Printf(" - Summary: %s\n", item.Summary())
132+
fmt.Printf(" - Details: %s\n", item.Details())
133+
}
134+
135+
// Restore egress
136+
Expect(RestoreEgress(ctx, ec2Wrapper, sgID)).To(Succeed(), "Failed to restore egress")
137+
ginkgo.GinkgoWriter.Printf("Egress restored\n")
138+
}
139+
})
140+
141+
// No-known-misconfiguration scenario: scale three openshift-monitoring
// workloads to zero, wait, and verify that neither the number of service logs
// nor the number of limited support reasons changed. Every workload is scaled
// back to its original replica count afterwards.
It("AWS CCS: cluster has gone missing (no known misconfiguration)", func(ctx context.Context) {
	// Same provider gate as the original: a no-op on anything but AWS.
	if provider != "aws" {
		return
	}

	const (
		monitoringNamespace = "openshift-monitoring"
		// How long to leave the workloads scaled down before re-checking OCM.
		waitAfterScaleDown = 20 * time.Minute
	)

	// replicasOrDefault reads a replica pointer without risking a nil
	// dereference. Kubernetes treats an unset replica count as 1.
	replicasOrDefault := func(replicas *int32) int32 {
		if replicas == nil {
			return 1
		}
		return *replicas
	}

	// scaleDeployment sets the named Deployment's replica count, retrying on
	// update conflicts, and returns the count it had before the change.
	scaleDeployment := func(ctx context.Context, name string, replicas int32) (int32, error) {
		var previous int32
		err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
			deployment := &appsv1.Deployment{}
			if err := k8s.Get(ctx, name, monitoringNamespace, deployment); err != nil {
				return err
			}
			previous = replicasOrDefault(deployment.Spec.Replicas)
			deployment.Spec.Replicas = &replicas
			return k8s.Update(ctx, deployment)
		})
		return previous, err
	}

	// scaleStatefulSet is the StatefulSet counterpart of scaleDeployment.
	scaleStatefulSet := func(ctx context.Context, name string, replicas int32) (int32, error) {
		var previous int32
		err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
			sts := &appsv1.StatefulSet{}
			if err := k8s.Get(ctx, name, monitoringNamespace, sts); err != nil {
				return err
			}
			previous = replicasOrDefault(sts.Spec.Replicas)
			sts.Spec.Replicas = &replicas
			return k8s.Update(ctx, sts)
		})
		return previous, err
	}

	// Get cluster information from OCM
	response, err := ocme2eCli.ClustersMgmt().V1().Clusters().Cluster(clusterID).Get().Send()
	Expect(err).ToNot(HaveOccurred(), "failed to get cluster from OCM")
	cluster := response.Body()
	Expect(cluster).ToNot(BeNil(), "received nil cluster from OCM")

	// Baseline: service logs and limited support reasons before the change.
	logs, err := GetServiceLogs(ocmCli, cluster)
	Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
	logsBefore := logs.Items().Slice()

	lsResponseBefore, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
	Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
	lsReasonsBefore := lsResponseBefore.Items().Len()

	// Each scale-down registers its own restore right after it succeeds, so
	// a later failure can never leave a workload at zero replicas. Cleanups
	// run last-in-first-out, which keeps the original restore order:
	// alertmanager-main, prometheus-operator, cluster-monitoring-operator.

	// Step 1: Scale down cluster-monitoring-operator with retry
	fmt.Println("Step 1: Scaling down cluster-monitoring-operator")
	originalCMOReplicas, err := scaleDeployment(ctx, "cluster-monitoring-operator", 0)
	Expect(err).ToNot(HaveOccurred(), "failed to scale down cluster-monitoring-operator")
	DeferCleanup(func(ctx context.Context) {
		// Step 7: Scale cluster-monitoring-operator back up with retry
		fmt.Println("Step 7: Scaling cluster-monitoring-operator back up")
		_, err := scaleDeployment(ctx, "cluster-monitoring-operator", originalCMOReplicas)
		Expect(err).ToNot(HaveOccurred(), "failed to scale up cluster-monitoring-operator")
		fmt.Printf("Cluster-monitoring-operator scaled back up to %d replicas\n", originalCMOReplicas)

		// This cleanup was registered first, so it runs last.
		fmt.Println("Test completed: All components restored to original replica counts.")
	})
	fmt.Printf("Scaled down cluster-monitoring-operator from %d to 0 replicas\n", originalCMOReplicas)

	// Step 2: Scale down prometheus-operator with retry
	fmt.Println("Step 2: Scaling down prometheus-operator")
	originalPOReplicas, err := scaleDeployment(ctx, "prometheus-operator", 0)
	Expect(err).ToNot(HaveOccurred(), "failed to scale down prometheus-operator")
	DeferCleanup(func(ctx context.Context) {
		// Step 6: Scale prometheus-operator back up with retry
		fmt.Println("Step 6: Scaling prometheus-operator back up")
		_, err := scaleDeployment(ctx, "prometheus-operator", originalPOReplicas)
		Expect(err).ToNot(HaveOccurred(), "failed to scale up prometheus-operator")
		fmt.Printf("Prometheus-operator scaled back up to %d replicas\n", originalPOReplicas)
	})
	fmt.Printf("Scaled down prometheus-operator from %d to 0 replicas\n", originalPOReplicas)

	// Step 3: Scale down alertmanager-main with retry
	fmt.Println("Step 3: Scaling down alertmanager-main")
	originalAMReplicas, err := scaleStatefulSet(ctx, "alertmanager-main", 0)
	Expect(err).ToNot(HaveOccurred(), "failed to scale down alertmanager")
	DeferCleanup(func(ctx context.Context) {
		// Step 5: Scale alertmanager-main back up with retry
		fmt.Println("Step 5: Scaling alertmanager-main back up")
		_, err := scaleStatefulSet(ctx, "alertmanager-main", originalAMReplicas)
		Expect(err).ToNot(HaveOccurred(), "failed to scale up alertmanager")
		fmt.Printf("Alertmanager scaled back up to %d replicas\n", originalAMReplicas)
	})
	fmt.Printf("Alertmanager scaled down from %d to 0 replicas. Waiting...\n", originalAMReplicas)

	// Step 4: wait with the workloads scaled down, but stop early if the
	// spec context is cancelled instead of sleeping through it.
	select {
	case <-time.After(waitAfterScaleDown):
	case <-ctx.Done():
		Fail(fmt.Sprintf("context cancelled while waiting with monitoring scaled down: %v", ctx.Err()))
	}

	logs, err = GetServiceLogs(ocmCli, cluster)
	Expect(err).ToNot(HaveOccurred(), "Failed to get service logs")
	logsAfter := logs.Items().Slice()

	lsResponseAfter, err := GetLimitedSupportReasons(ocme2eCli, clusterID)
	Expect(err).NotTo(HaveOccurred(), "Failed to get limited support reasons")
	lsReasonsAfter := lsResponseAfter.Items().Len()

	// Neither count should have changed while the workloads were down.
	Expect(logsAfter).To(HaveLen(len(logsBefore)), "Service logs count changed after scale down/up")
	Expect(lsReasonsAfter).To(Equal(lsReasonsBefore), "Limited support reasons changed after scale down/up")
})
71269
})

0 commit comments

Comments
 (0)