From 71b85967bb6e7967c0755de41cc3be6b2b84801b Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Wed, 16 Jul 2025 00:34:38 -0300 Subject: [PATCH 1/3] fix/byosg: remove managed SG on BYOSG scenario for CLB Fix the "managed" (controller-owned) security group leak when user provided security group is added to an existing Service type-loadBalancer CLB. --- pkg/providers/v1/aws.go | 16 +-- pkg/providers/v1/aws_loadbalancer.go | 165 +++++++++++++++++++++++++-- 2 files changed, 156 insertions(+), 25 deletions(-) diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go index 89b1d742da..6200738c4b 100644 --- a/pkg/providers/v1/aws.go +++ b/pkg/providers/v1/aws.go @@ -2940,19 +2940,9 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context, loadBalancerSecurityGroupID := lbSecurityGroupIDs[0] // Get the actual list of groups that allow ingress from the load-balancer - actualGroups := make(map[*ec2types.SecurityGroup]bool) - { - describeRequest := &ec2.DescribeSecurityGroupsInput{} - describeRequest.Filters = []ec2types.Filter{ - newEc2Filter("ip-permission.group-id", loadBalancerSecurityGroupID), - } - response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest) - if err != nil { - return fmt.Errorf("error querying security groups for ELB: %q", err) - } - for _, sg := range response { - actualGroups[&sg] = c.tagging.hasClusterTag(sg.Tags) - } + actualGroups, _, err := c.buildSecurityGroupRuleReferences(ctx, loadBalancerSecurityGroupID) + if err != nil { + return fmt.Errorf("error building security group rule references: %w", err) } // Open the firewall from the load balancer to the instance diff --git a/pkg/providers/v1/aws_loadbalancer.go b/pkg/providers/v1/aws_loadbalancer.go index f9dd598450..55df9ecf17 100644 --- a/pkg/providers/v1/aws_loadbalancer.go +++ b/pkg/providers/v1/aws_loadbalancer.go @@ -29,6 +29,7 @@ import ( "time" "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing" @@ -1251,24 +1252,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam { // Sync security groups - expected := sets.New[string](securityGroupIDs...) - actual := stringSetFromList(loadBalancer.SecurityGroups) + expected := sets.New(securityGroupIDs...) + actual := sets.New(loadBalancer.SecurityGroups...) if !expected.Equal(actual) { // This call just replaces the security groups, unlike e.g. subnets (!) - request := &elb.ApplySecurityGroupsToLoadBalancerInput{} - request.LoadBalancerName = aws.String(loadBalancerName) - if securityGroupIDs == nil { - request.SecurityGroups = nil - } else { - request.SecurityGroups = securityGroupIDs - } - klog.V(2).Info("Applying updated security groups to load balancer") - _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request) - if err != nil { + klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName) + if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{ + LoadBalancerName: aws.String(loadBalancerName), + SecurityGroups: securityGroupIDs, + }); err != nil { return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err) } dirty = true + + // Ensure the replaced security groups are removed from AWS when owned by the controller. + if errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 { + return nil, fmt.Errorf("error removing owned security groups: %v", errs) + } } } @@ -1879,3 +1880,143 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error { return nil } + +// isOwnedSecurityGroup checks if the security group is owned by the controller +// by checking if the security group has the cluster ownership tag +// (kubernetes.io/cluster/=owned). +// +// Parameters: +// - `ctx`: The context for the operation. +// - `securityGroupID`: The ID of the security group to check. +// +// Returns: +// - `bool`: True if the security group is owned by the controller, false otherwise. +// - `error`: An error if the security group cannot be retrieved, is not found, +// or if multiple security groups are found with the same ID (unexpected). +func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) { + groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{ + GroupIds: []string{securityGroupID}, + }) + if err != nil { + return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err) + } + if len(groups) == 0 { + return false, fmt.Errorf("security group %q not found", securityGroupID) + } + if len(groups) != 1 { + // This should not be possible - ids should be unique + return false, fmt.Errorf("[BUG] multiple security groups(%d) found with same id %q", len(groups), securityGroupID) + } + return c.tagging.hasClusterTagOwned(groups[0].Tags) +} + +// buildSecurityGroupRuleReferences finds all security groups that have ingress rules +// referencing the specified security group ID, and categorizes them based on cluster tagging. +// This is used to identify dependencies before removing a security group. +// +// Parameters: +// - ctx: The context for the request. +// - sgID: The ID of the security group to find references for. +// +// Returns: +// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false). +// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID. +// - error: An error if the AWS DescribeSecurityGroups API call fails. +func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) { + groupsHasTags := make(map[*ec2types.SecurityGroup]bool) + groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet) + sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{ + Filters: []ec2types.Filter{ + newEc2Filter("ip-permission.group-id", sgID), + }, + }) + if err != nil { + return groupsHasTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err) + } + + for _, sg := range sgsOut { + groupsHasTags[&sg] = c.tagging.hasClusterTag(sg.Tags) + + groupsLinkedPermissions[&sg] = NewIPPermissionSet() + for _, rule := range sg.IpPermissions { + if rule.UserIdGroupPairs != nil { + for _, pair := range rule.UserIdGroupPairs { + if pair.GroupId != nil && aws.ToString(pair.GroupId) == sgID { + groupsLinkedPermissions[&sg].Insert(rule) + } + } + } + } + + } + return groupsHasTags, groupsLinkedPermissions, nil +} + +// removeOwnedSecurityGroups removes the CLB owned/managed security groups from AWS. +// It revokes ingress rules that reference the security groups to be removed, +// then deletes the security groups that are owned by the controller. +// This is used when updating load balancer security groups to clean up orphaned ones. +// +// Parameters: +// - `ctx`: The context for the operation. +// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations). +// - `securityGroups`: The list of security group IDs to process for removal. +// +// Returns: +// - `[]error`: Collection of all errors encountered during the removal process. +func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) []error { + allErrs := []error{} + sgMap := make(map[string]struct{}) + + // Validate each security group references building a reading list to be deleted. + for _, sg := range securityGroups { + isOwned, err := c.isOwnedSecurityGroup(ctx, sg) + if err != nil { + allErrs = append(allErrs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err)) + continue + } + + groupsWithClusterTag, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg) + if err != nil { + allErrs = append(allErrs, fmt.Errorf("error building security group rule references for %q: %w", sg, err)) + continue + } + + // Revoke ingress rules referencing the security group to be deleted + // from cluster-tagged security groups, when the referenced security + // group has no cluster tag, skip the revoke assuming it is user-managed. + for sgTarget, sgPerms := range groupsLinkedPermissions { + if !groupsWithClusterTag[sgTarget] { + klog.Warningf("security group %q has no cluster tag, skipping remove lifecycle after update", sg) + continue + } + + klog.Infof("revoking security group ingress references of %q from %q", sg, aws.ToString(sgTarget.GroupId)) + if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{ + GroupId: sgTarget.GroupId, + IpPermissions: sgPerms.List(), + }); err != nil { + allErrs = append(allErrs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err)) + continue + } + } + + // Skip security group removal when the security group is not owned by the controller. + if !isOwned { + klog.Warningf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg) + continue + } + + klog.Infof("making loadbalancer owned security group %q ready for deletion", sg) + sgMap[sg] = struct{}{} + } + if len(sgMap) == 0 { + return allErrs + } + + if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil { + return append(allErrs, fmt.Errorf("error deleting security groups %v: %v", sgMap, err)) + } + klog.Infof("loadbalancer owned security groups deleted!") + return nil +} From acd80220f80af3a549e6be5e8e413af20785fb5d Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Wed, 16 Jul 2025 00:33:56 -0300 Subject: [PATCH 2/3] fix/byosg/tests: unit tests to handle managed SG removal on BYOSG Introduce unit tests for functions added to validate Service update to BYO Security Group annotations from a managed SG state. --- pkg/providers/v1/aws_loadbalancer.go | 29 -- pkg/providers/v1/aws_loadbalancer_test.go | 428 ++++++++++++++++++++++ pkg/providers/v1/aws_test.go | 13 +- pkg/providers/v1/tags_test.go | 10 +- 4 files changed, 441 insertions(+), 39 deletions(-) diff --git a/pkg/providers/v1/aws_loadbalancer.go b/pkg/providers/v1/aws_loadbalancer.go index 55df9ecf17..201a1500e3 100644 --- a/pkg/providers/v1/aws_loadbalancer.go +++ b/pkg/providers/v1/aws_loadbalancer.go @@ -1881,35 +1881,6 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error { return nil } -// isOwnedSecurityGroup checks if the security group is owned by the controller -// by checking if the security group has the cluster ownership tag -// (kubernetes.io/cluster/=owned). -// -// Parameters: -// - `ctx`: The context for the operation. -// - `securityGroupID`: The ID of the security group to check. -// -// Returns: -// - `bool`: True if the security group is owned by the controller, false otherwise. -// - `error`: An error if the security group cannot be retrieved, is not found, -// or if multiple security groups are found with the same ID (unexpected). -func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) { - groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{ - GroupIds: []string{securityGroupID}, - }) - if err != nil { - return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err) - } - if len(groups) == 0 { - return false, fmt.Errorf("security group %q not found", securityGroupID) - } - if len(groups) != 1 { - // This should not be possible - ids should be unique - return false, fmt.Errorf("[BUG] multiple security groups(%d) found with same id %q", len(groups), securityGroupID) - } - return c.tagging.hasClusterTagOwned(groups[0].Tags) -} - // buildSecurityGroupRuleReferences finds all security groups that have ingress rules // referencing the specified security group ID, and categorizes them based on cluster tagging. // This is used to identify dependencies before removing a security group. diff --git a/pkg/providers/v1/aws_loadbalancer_test.go b/pkg/providers/v1/aws_loadbalancer_test.go index 39037205e8..2fb44d558b 100644 --- a/pkg/providers/v1/aws_loadbalancer_test.go +++ b/pkg/providers/v1/aws_loadbalancer_test.go @@ -18,6 +18,7 @@ package aws import ( "context" + "errors" "fmt" "reflect" "testing" @@ -28,11 +29,14 @@ import ( "k8s.io/apimachinery/pkg/types" "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" elbtypes "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing/types" elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" "k8s.io/cloud-provider-aws/pkg/providers/v1/config" ) @@ -1376,6 +1380,232 @@ func TestCreateSubnetMappings(t *testing.T) { } } +func TestCloud_removeOwnedSecurityGroups(t *testing.T) { + tests := []struct { + name string + securityGroups []string + setupMocks func(*MockedFakeEC2) + setupSecurityGroupTags func() map[string][]ec2types.Tag + expectError bool + expectRevokeCallCount int + expectDeleteCallCount int + }{ + { + name: "successfully remove owned security groups", + securityGroups: []string{"sg-owned1", "sg-owned2"}, + setupMocks: func(mockedEC2 *MockedFakeEC2) { + // Mock DescribeSecurityGroups for ownership check + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.GroupIds) == 1 && (input.GroupIds[0] == "sg-owned1" || input.GroupIds[0] == "sg-owned2") + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-owned1"), + Tags: []ec2types.Tag{ + { + Key: aws.String("kubernetes.io/cluster/test-cluster"), + Value: aws.String("owned"), + }, + }, + }, + }, nil) + + // Mock DescribeSecurityGroups for rule references + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.Filters) > 0 + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-ref1"), + Tags: []ec2types.Tag{ + { + Key: aws.String("kubernetes.io/cluster/test-cluster"), + Value: aws.String("owned"), + }, + }, + IpPermissions: []ec2types.IpPermission{ + { + IpProtocol: aws.String("tcp"), + FromPort: aws.Int32(80), + ToPort: aws.Int32(80), + UserIdGroupPairs: []ec2types.UserIdGroupPair{ + { + GroupId: aws.String("sg-owned1"), + }, + }, + }, + }, + }, + }, nil) + + // Mock RevokeSecurityGroupIngress + mockedEC2.On("RevokeSecurityGroupIngress", mock.MatchedBy(func(input *ec2.RevokeSecurityGroupIngressInput) bool { + return aws.ToString(input.GroupId) == "sg-ref1" + })).Return(&ec2.RevokeSecurityGroupIngressOutput{}, nil) + + // Mock DeleteSecurityGroup + mockedEC2.On("DeleteSecurityGroup", mock.MatchedBy(func(input *ec2.DeleteSecurityGroupInput) bool { + return aws.ToString(input.GroupId) == "sg-owned1" || aws.ToString(input.GroupId) == "sg-owned2" + })).Return(&ec2.DeleteSecurityGroupOutput{}, nil) + }, + expectError: false, + expectRevokeCallCount: 2, + expectDeleteCallCount: 2, + }, + { + name: "skip non-owned security groups", + securityGroups: []string{"sg-not-owned1", "sg-not-owned2"}, + setupMocks: func(mockedEC2 *MockedFakeEC2) { + // Mock DescribeSecurityGroups for ownership check - return non-owned + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.GroupIds) == 1 && input.GroupIds[0] == "sg-not-owned1" + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-not-owned1"), + Tags: []ec2types.Tag{ + { + Key: aws.String("some-other-tag"), + Value: aws.String("some-value"), + }, + }, + }, + }, nil) + + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.GroupIds) == 1 && input.GroupIds[0] == "sg-not-owned2" + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-not-owned2"), + Tags: []ec2types.Tag{ + { + Key: aws.String("another-tag"), + Value: aws.String("another-value"), + }, + }, + }, + }, nil) + + // Mock DescribeSecurityGroups for rule references for sg-not-owned1 + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.Filters) > 0 && len(input.Filters[0].Values) > 0 && input.Filters[0].Values[0] == "sg-not-owned1" + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-ref1"), + Tags: []ec2types.Tag{ + { + Key: aws.String("kubernetes.io/cluster/test-cluster"), + Value: aws.String("owned"), + }, + }, + IpPermissions: []ec2types.IpPermission{ + { + IpProtocol: aws.String("tcp"), + FromPort: aws.Int32(80), + ToPort: aws.Int32(80), + UserIdGroupPairs: []ec2types.UserIdGroupPair{ + { + GroupId: aws.String("sg-not-owned1"), + }, + }, + }, + }, + }, + }, nil) + + // Mock DescribeSecurityGroups for rule references for sg-not-owned2 + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.Filters) > 0 && len(input.Filters[0].Values) > 0 && input.Filters[0].Values[0] == "sg-not-owned2" + })).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-ref2"), + Tags: []ec2types.Tag{ + { + Key: aws.String("kubernetes.io/cluster/test-cluster"), + Value: aws.String("owned"), + }, + }, + IpPermissions: []ec2types.IpPermission{ + { + IpProtocol: aws.String("tcp"), + FromPort: aws.Int32(443), + ToPort: aws.Int32(443), + UserIdGroupPairs: []ec2types.UserIdGroupPair{ + { + GroupId: aws.String("sg-not-owned2"), + }, + }, + }, + }, + }, + }, nil) + + // Mock RevokeSecurityGroupIngress for sg-ref1 + mockedEC2.On("RevokeSecurityGroupIngress", mock.MatchedBy(func(input *ec2.RevokeSecurityGroupIngressInput) bool { + return aws.ToString(input.GroupId) == "sg-ref1" + })).Return(&ec2.RevokeSecurityGroupIngressOutput{}, nil) + + // Mock RevokeSecurityGroupIngress for sg-ref2 + mockedEC2.On("RevokeSecurityGroupIngress", mock.MatchedBy(func(input *ec2.RevokeSecurityGroupIngressInput) bool { + return aws.ToString(input.GroupId) == "sg-ref2" + })).Return(&ec2.RevokeSecurityGroupIngressOutput{}, nil) + + // DeleteSecurityGroup should NOT be called for non-owned groups + }, + expectError: false, + expectRevokeCallCount: 2, + expectDeleteCallCount: 0, + }, + { + name: "error checking ownership", + securityGroups: []string{"sg-error"}, + setupMocks: func(mockedEC2 *MockedFakeEC2) { + // Mock DescribeSecurityGroups to return error + mockedEC2.On("DescribeSecurityGroups", mock.MatchedBy(func(input *ec2.DescribeSecurityGroupsInput) bool { + return len(input.GroupIds) == 1 && input.GroupIds[0] == "sg-error" + })).Return([]ec2types.SecurityGroup(nil), errors.New("AWS error")) + }, + expectError: true, // Function should return error when ownership check fails + expectRevokeCallCount: 0, + expectDeleteCallCount: 0, + }, + { + name: "empty security groups list", + securityGroups: []string{}, + setupMocks: func(mockedEC2 *MockedFakeEC2) { + // No mocks needed for empty list + }, + expectError: false, + expectRevokeCallCount: 0, + expectDeleteCallCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockedEC2 := &MockedFakeEC2{} + + // Setup mocks + tt.setupMocks(mockedEC2) + + cloud := &Cloud{ + ec2: mockedEC2, + tagging: awsTagging{ + ClusterID: "test-cluster", + }, + } + + ctx := context.Background() + errs := cloud.removeOwnedSecurityGroups(ctx, "test-lb", tt.securityGroups) + + if tt.expectError { + assert.NotEmpty(t, errs) + } else { + assert.Empty(t, errs) + } + + mockedEC2.AssertExpectations(t) + }) + } +} + // Unit test generated by Cursor AI func TestGetKeyValuePropertiesFromAnnotation_TargetGroupAttributes(t *testing.T) { tests := []struct { @@ -1921,3 +2151,201 @@ func TestCloud_reconcileTargetGroupsAttributes(t *testing.T) { }) } } + +func TestCloud_buildSecurityGroupRuleReferences(t *testing.T) { + tests := []struct { + name string + targetGroupID string + setupMocks func(*MockedFakeEC2) + expectError bool + expectedErrorContains string + expectedGroupsWithTagsCount int + expectedGroupsLinkedPermissionsCount int + additionalAssertions func(t *testing.T, groupsWithTags map[*ec2types.SecurityGroup]bool, groupsLinkedPermissions map[*ec2types.SecurityGroup]IPPermissionSet) + }{ + { + name: "success with cluster tagged security group and linked permissions", + targetGroupID: "sg-target", + setupMocks: func(mockedEC2 *MockedFakeEC2) { + mockedEC2.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("ip-permission.group-id"), + Values: []string{"sg-target"}, + }, + }, + }).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-owned"), + Tags: []ec2types.Tag{ + { + Key: aws.String("kubernetes.io/cluster/test-cluster"), + Value: aws.String("owned"), + }, + }, + IpPermissions: []ec2types.IpPermission{ + { + IpProtocol: aws.String("tcp"), + FromPort: aws.Int32(80), + ToPort: aws.Int32(80), + UserIdGroupPairs: []ec2types.UserIdGroupPair{ + { + GroupId: aws.String("sg-target"), + }, + }, + }, + }, + }, + }, nil) + }, + expectError: false, + expectedGroupsWithTagsCount: 1, + expectedGroupsLinkedPermissionsCount: 1, + additionalAssertions: func(t *testing.T, groupsWithTags map[*ec2types.SecurityGroup]bool, groupsLinkedPermissions map[*ec2types.SecurityGroup]IPPermissionSet) { + // Find the security group in the results + var foundSG *ec2types.SecurityGroup + for sg := range groupsWithTags { + if aws.ToString(sg.GroupId) == "sg-owned" { + foundSG = sg + break + } + } + require.NotNil(t, foundSG) + + // Check that the security group has cluster tags + assert.True(t, groupsWithTags[foundSG]) + + // Check that the security group has linked permissions + assert.Equal(t, 1, groupsLinkedPermissions[foundSG].Len()) + }, + }, + { + name: "success with non-cluster tagged security group", + targetGroupID: "sg-target", + setupMocks: func(mockedEC2 *MockedFakeEC2) { + mockedEC2.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("ip-permission.group-id"), + Values: []string{"sg-target"}, + }, + }, + }).Return([]ec2types.SecurityGroup{ + { + GroupId: aws.String("sg-unowned"), + Tags: []ec2types.Tag{ + { + Key: aws.String("some-other-tag"), + Value: aws.String("some-value"), + }, + }, + IpPermissions: []ec2types.IpPermission{ + { + IpProtocol: aws.String("tcp"), + FromPort: aws.Int32(80), + ToPort: aws.Int32(80), + UserIdGroupPairs: []ec2types.UserIdGroupPair{ + { + GroupId: aws.String("sg-target"), + }, + }, + }, + }, + }, + }, nil) + }, + expectError: false, + expectedGroupsWithTagsCount: 1, + expectedGroupsLinkedPermissionsCount: 1, + additionalAssertions: func(t *testing.T, groupsWithTags map[*ec2types.SecurityGroup]bool, groupsLinkedPermissions map[*ec2types.SecurityGroup]IPPermissionSet) { + // Find the security group in the linkedPermissions results + var foundSG *ec2types.SecurityGroup + for sg := range groupsLinkedPermissions { + if aws.ToString(sg.GroupId) == "sg-unowned" { + foundSG = sg + break + } + } + require.NotNil(t, foundSG) + + // Check that the security group is in groupsWithTags but not cluster tagged + _, exists := groupsWithTags[foundSG] + assert.True(t, exists) + + // Check that the security group is not cluster tagged + assert.False(t, groupsWithTags[foundSG]) + }, + }, + { + name: "error when DescribeSecurityGroups fails", + targetGroupID: "sg-target", + setupMocks: func(mockedEC2 *MockedFakeEC2) { + mockedEC2.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("ip-permission.group-id"), + Values: []string{"sg-target"}, + }, + }, + }).Return([]ec2types.SecurityGroup{}, errors.New("AWS API error")) + }, + expectError: true, + expectedErrorContains: "error querying security groups for ELB", + expectedGroupsWithTagsCount: 0, + expectedGroupsLinkedPermissionsCount: 0, + }, + { + name: "success with no security groups found", + targetGroupID: "sg-target", + setupMocks: func(mockedEC2 *MockedFakeEC2) { + mockedEC2.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("ip-permission.group-id"), + Values: []string{"sg-target"}, + }, + }, + }).Return([]ec2types.SecurityGroup{}, nil) + }, + expectError: false, + expectedGroupsWithTagsCount: 0, + expectedGroupsLinkedPermissionsCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockedEC2 := &MockedFakeEC2{} + tt.setupMocks(mockedEC2) + + c := &Cloud{ + ec2: mockedEC2, + region: "us-west-2", + tagging: awsTagging{ + ClusterID: "test-cluster", + }, + } + + ctx := context.TODO() + groupsWithTags, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, tt.targetGroupID) + + if tt.expectError { + require.Error(t, err) + if tt.expectedErrorContains != "" { + assert.Contains(t, err.Error(), tt.expectedErrorContains) + } + } else { + require.NoError(t, err) + } + + assert.Len(t, groupsWithTags, tt.expectedGroupsWithTagsCount) + assert.Len(t, groupsLinkedPermissions, tt.expectedGroupsLinkedPermissionsCount) + + if tt.additionalAssertions != nil { + tt.additionalAssertions(t, groupsWithTags, groupsLinkedPermissions) + } + + mockedEC2.AssertExpectations(t) + }) + } +} diff --git a/pkg/providers/v1/aws_test.go b/pkg/providers/v1/aws_test.go index 3188b24b03..afb5f8e921 100644 --- a/pkg/providers/v1/aws_test.go +++ b/pkg/providers/v1/aws_test.go @@ -69,7 +69,7 @@ func (m *MockedFakeEC2) expectDescribeSecurityGroups(clusterID, groupName string m.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{Filters: []ec2types.Filter{ newEc2Filter("group-name", groupName), newEc2Filter("vpc-id", ""), - }}).Return([]ec2types.SecurityGroup{{Tags: tags}}) + }}).Return([]ec2types.SecurityGroup{{Tags: tags}}, nil) } func (m *MockedFakeEC2) expectDescribeSecurityGroupsAll(clusterID string) { @@ -81,7 +81,7 @@ func (m *MockedFakeEC2) expectDescribeSecurityGroupsAll(clusterID string) { m.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{}).Return([]ec2types.SecurityGroup{{ GroupId: aws.String("sg-123456"), Tags: tags, - }}) + }}, nil) } func (m *MockedFakeEC2) expectDescribeSecurityGroupsByFilter(clusterID, filterName string, filterValues ...string) { @@ -92,7 +92,7 @@ func (m *MockedFakeEC2) expectDescribeSecurityGroupsByFilter(clusterID, filterNa m.On("DescribeSecurityGroups", &ec2.DescribeSecurityGroupsInput{Filters: []ec2types.Filter{ newEc2Filter(filterName, filterValues...), - }}).Return([]ec2types.SecurityGroup{{Tags: tags}}) + }}).Return([]ec2types.SecurityGroup{{Tags: tags}}, nil) } func (m *MockedFakeEC2) RevokeSecurityGroupIngress(ctx context.Context, request *ec2.RevokeSecurityGroupIngressInput, optFns ...func(*ec2.Options)) (*ec2.RevokeSecurityGroupIngressOutput, error) { @@ -112,10 +112,11 @@ func (m *MockedFakeEC2) DescribeSecurityGroups(ctx context.Context, request *ec2 return []ec2types.SecurityGroup{}, nil } args := m.Called(request) - if len(args) > 1 { - return args.Get(0).([]ec2types.SecurityGroup), args.Error(1) + // Handle case where no expectations are set up (returns default values) + if len(args) < 2 { + return args.Get(0).([]ec2types.SecurityGroup), nil } - return args.Get(0).([]ec2types.SecurityGroup), nil + return args.Get(0).([]ec2types.SecurityGroup), args.Error(1) } func (m *MockedFakeEC2) CreateSecurityGroup(ctx context.Context, request *ec2.CreateSecurityGroupInput, optFns ...func(*ec2.Options)) (*ec2.CreateSecurityGroupOutput, error) { diff --git a/pkg/providers/v1/tags_test.go b/pkg/providers/v1/tags_test.go index b550a03cf4..6565d787ca 100644 --- a/pkg/providers/v1/tags_test.go +++ b/pkg/providers/v1/tags_test.go @@ -605,13 +605,15 @@ func TestHasClusterTagOwned(t *testing.T) { } result, err := tagging.hasClusterTagOwned(tt.tags) + if tt.expectedError != "" { - assert.Error(t, err) - assert.Contains(t, err.Error(), tt.expectedError) + assert.Error(t, err, "Expected error for test case: %s", tt.name) + assert.Contains(t, err.Error(), tt.expectedError, "Error message should contain expected text") + assert.Equal(t, tt.expected, result, "Result should match expected value even when error occurs") } else { - assert.NoError(t, err) + assert.NoError(t, err, "Expected no error for test case: %s", tt.name) + assert.Equal(t, tt.expected, result, "hasClusterTagOwned returned unexpected result") } - assert.Equal(t, tt.expected, result, "hasClusterTagOwned returned unexpected result") }) } } From e98e82625132dd056ed9124e38d62eed4b36ac2d Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Fri, 1 Aug 2025 17:43:04 -0300 Subject: [PATCH 3/3] fix/byosg/e2e: introduce scenario BYO SG updates to CLB Introduce BYO Security Group(SG) update scenario to Service CLB to validate SG leak when user has created a Service CLB with default SG and eventually updated to a user-provided. https://github.com/kubernetes/cloud-provider-aws/issues/1208 --- tests/e2e/aws_helper.go | 413 ++++++++++++++++++++++++++++ tests/e2e/go.mod | 14 +- tests/e2e/go.sum | 28 +- tests/e2e/loadbalancer.go | 547 ++++++++++++++++++++++++++++++++------ 4 files changed, 909 insertions(+), 93 deletions(-) create mode 100644 tests/e2e/aws_helper.go diff --git a/tests/e2e/aws_helper.go b/tests/e2e/aws_helper.go new file mode 100644 index 0000000000..8604b79dcb --- /dev/null +++ b/tests/e2e/aws_helper.go @@ -0,0 +1,413 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package e2e + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/ec2" + + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing" + elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +// awsHelper provides AWS API operations for e2e tests +type awsHelper struct { + ctx context.Context + ec2Client *ec2.Client + elbClient *elb.Client + elbv2Client *elbv2.Client + + // Cluster information + clusterName string + clusterTag string + clusterTagValue string + vpcID string + awsRegion string +} + +// newAWSHelper creates a new AWS helper with configured clients +func newAWSHelper(ctx context.Context, cs clientset.Interface) (*awsHelper, error) { + cfg, err := config.LoadDefaultConfig(ctx) + framework.ExpectNoError(err, "unable to load AWS config") + + h := &awsHelper{ + ctx: ctx, + ec2Client: ec2.NewFromConfig(cfg), + elbClient: elb.NewFromConfig(cfg), + elbv2Client: elbv2.NewFromConfig(cfg), + } + + framework.Logf("Discovering cluster tag") + framework.ExpectNoError(h.discoverClusterTag(cs), "unable to find cluster tag") + framework.Logf("Cluster tag discovered: %s", h.clusterTag) + + return h, nil +} + +// discoverClusterTag discovers the cluster tag from a cluster. +// The discover is done by looking up the EC2 instance tags with tag:Name prefix kubernetes.io/cluster. +// The EC2 Instance ID is discovered from a cluster node object. +// The cluster ID, VPC ID and cluster tag are discovered from the EC2 instance tags. +// If is any error is found, the function returns an error. +func (h *awsHelper) discoverClusterTag(cs clientset.Interface) error { + nodes, err := cs.CoreV1().Nodes().List(h.ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list nodes: %v", err) + } + + var instanceID string + for _, node := range nodes.Items { + providerID := node.Spec.ProviderID + if providerID == "" { + framework.Logf("providerID %s is empty for node %s", providerID, node.Name) + continue + } + providerID = strings.Replace(providerID, "aws:///", "", 1) + if len(strings.Split(providerID, "/")) < 2 { + framework.Logf("providerID %s is not a valid EC2 instance ID", providerID) + continue + } + h.awsRegion = strings.Split(providerID, "/")[0] + instanceID = strings.Split(providerID, "/")[1] + if !strings.HasPrefix(instanceID, "i-") { + framework.Logf("instanceID %s is not a valid EC2 instance ID", instanceID) + continue + } + break + } + + instance, err := h.ec2Client.DescribeInstances(h.ctx, &ec2.DescribeInstancesInput{ + InstanceIds: []string{instanceID}, + }) + if err != nil { + return fmt.Errorf("failed to describe instances: %v", err) + } + + clusterTagFound := false + for _, reservation := range instance.Reservations { + for _, tag := range reservation.Instances[0].Tags { + if strings.HasPrefix(aws.ToString(tag.Key), "kubernetes.io/cluster") { + h.clusterTag = aws.ToString(tag.Key) + h.clusterTagValue = aws.ToString(tag.Value) + clusterTagFound = true + break + } + } + if clusterTagFound { + break + } + } + + if !clusterTagFound { + return fmt.Errorf("cluster tag not found in the instance %s", instanceID) + } + + h.clusterName = strings.Split(h.clusterTag, "/")[2] + if h.clusterName == "" { + return fmt.Errorf("cluster name not found in the cluster tag %s", h.clusterTag) + } + + // extract VPC ID from the Instance + for _, networkInterface := range instance.Reservations[0].Instances[0].NetworkInterfaces { + h.vpcID = aws.ToString(networkInterface.VpcId) + break + } + + if h.vpcID == "" { + return fmt.Errorf("VPC ID not found in the instance %s", instanceID) + } + + return nil +} + +// getLoadBalancerSecurityGroups gets security groups attached to a load balancer +func (h *awsHelper) getLoadBalancerSecurityGroups(isNLB bool, lbDNSName string) ([]string, error) { + if isNLB { + if h.elbv2Client == nil { + return nil, fmt.Errorf("elbv2Client is not initialized") + } + describeNLBs, err := h.elbv2Client.DescribeLoadBalancers(h.ctx, &elbv2.DescribeLoadBalancersInput{}) + framework.ExpectNoError(err, "failed to describe load balancers to retrieve security groups") + + for _, lb := range describeNLBs.LoadBalancers { + if strings.EqualFold(aws.ToString(lb.DNSName), lbDNSName) { + return lb.SecurityGroups, nil + } + } + return nil, fmt.Errorf("load balancer with DNS %s not found", lbDNSName) + } + + // Get CLB ARN from DNS name + if h.elbClient == nil { + return nil, fmt.Errorf("elbClient is not initialized") + } + describeCLBs, err := h.elbClient.DescribeLoadBalancers(h.ctx, &elb.DescribeLoadBalancersInput{}) + framework.ExpectNoError(err, "failed to describe load balancers to retrieve security groups") + + for _, lb := range describeCLBs.LoadBalancerDescriptions { + if strings.EqualFold(aws.ToString(lb.DNSName), lbDNSName) { + return lb.SecurityGroups, nil + } + } + return nil, fmt.Errorf("load balancer with DNS %s not found", lbDNSName) +} + +// isSecurityGroupManaged checks if a security group is managed by the controller +// It checks for the cluster ownership tag to determine if the controller owns this security group +func (h *awsHelper) isSecurityGroupManaged(sgID string) (bool, error) { + sg, err := h.getSecurityGroup(sgID) + if err != nil { + return false, err + } + + // Check for cluster ownership tag - security groups owned by the controller + // have the cluster tag with "owned" value + clusterTagKey := fmt.Sprintf("kubernetes.io/cluster/%s", h.clusterName) + for _, tag := range sg.Tags { + if aws.ToString(tag.Key) == clusterTagKey && + aws.ToString(tag.Value) == "owned" { + return true, nil + } + } + return false, nil +} + +// getSecurityGroup gets a security group by ID +func (h *awsHelper) getSecurityGroup(sgID string) (*ec2types.SecurityGroup, error) { + if h.ec2Client == nil { + return nil, fmt.Errorf("ec2Client is not initialized") + } + result, err := h.ec2Client.DescribeSecurityGroups(h.ctx, &ec2.DescribeSecurityGroupsInput{ + GroupIds: []string{sgID}, + }) + if err != nil { + return nil, fmt.Errorf("unable to describe security group %q: %v", sgID, err) + } + if len(result.SecurityGroups) == 0 { + return nil, fmt.Errorf("security group %s not found", sgID) + } + return &result.SecurityGroups[0], nil +} + +// createSecurityGroup creates a new security group for testing purposes +func (h *awsHelper) createSecurityGroup(name, description string) (string, error) { + result, err := h.ec2Client.CreateSecurityGroup(h.ctx, &ec2.CreateSecurityGroupInput{ + GroupName: aws.String(name), + Description: aws.String(description), + TagSpecifications: []ec2types.TagSpecification{ + { + ResourceType: ec2types.ResourceTypeSecurityGroup, + Tags: []ec2types.Tag{ + { + Key: aws.String("Name"), + Value: aws.String(name), + }, + { + Key: aws.String(fmt.Sprintf("kubernetes.io/cluster/%s", h.clusterName)), + Value: aws.String("shared"), + }, + }, + }, + }, + VpcId: aws.String(h.vpcID), + }) + if err != nil { + return "", fmt.Errorf("failed to create security group: %v", err) + } + + return aws.ToString(result.GroupId), nil +} + +// cleanup cleans up the e2e resources. +func (e *e2eTestConfig) cleanup() { + framework.Logf("Cleaning up e2e resources") + + // Cleanup security group + if e.awsHelper != nil && e.byoSecurityGroupID != "" { + framework.Logf("Cleaning up security group %s", e.byoSecurityGroupID) + err := e.awsHelper.waitForSecurityGroupDeletion(e.byoSecurityGroupID, 5*time.Minute) + if err != nil { + framework.Logf("Failed to delete security group %s during cleanup: %v", e.byoSecurityGroupID, err) + } else { + framework.Logf("Successfully cleaned up security group %s", e.byoSecurityGroupID) + } + } +} + +// authorizeSecurityGroupToPorts authorizes a security group to allow traffic to the service ports +func (h *awsHelper) authorizeSecurityGroupToPorts(sgID string, ports []v1.ServicePort) error { + if h.ec2Client == nil { + return fmt.Errorf("ec2Client is not initialized") + } + + if len(ports) == 0 { + framework.Logf("No ports to authorize for security group %s", sgID) + return nil + } + + framework.Logf("Authorizing security group %s for %d ports", sgID, len(ports)) + + ingressRules := make([]ec2types.IpPermission, 0, len(ports)) + for _, port := range ports { + protocol := strings.ToLower(string(port.Protocol)) + rule := ec2types.IpPermission{ + FromPort: aws.Int32(int32(port.Port)), + ToPort: aws.Int32(int32(port.Port)), + IpProtocol: aws.String(protocol), + IpRanges: []ec2types.IpRange{ + { + CidrIp: aws.String("0.0.0.0/0"), + Description: aws.String(fmt.Sprintf("E2E test access for port %d", port.Port)), + }, + }, + } + ingressRules = append(ingressRules, rule) + framework.Logf("Adding rule: protocol=%s, port=%d, source=0.0.0.0/0", protocol, port.Port) + } + + framework.Logf("Calling AWS AuthorizeSecurityGroupIngress for security group %s with %d rules", sgID, len(ingressRules)) + _, err := h.ec2Client.AuthorizeSecurityGroupIngress(h.ctx, &ec2.AuthorizeSecurityGroupIngressInput{ + GroupId: aws.String(sgID), + IpPermissions: ingressRules, + }) + if err != nil { + // Check if error is due to duplicate rules (which is actually okay) + if strings.Contains(err.Error(), "InvalidPermission.Duplicate") { + framework.Logf("Some rules already exist in security group %s (this is okay): %v", sgID, err) + return nil + } + return fmt.Errorf("failed to authorize security group %s to ports %v: %v", sgID, ports, err) + } + + framework.Logf("Successfully authorized security group %s.", sgID) + return nil +} + +// verifySecurityGroupRules verifies that the expected rules exist in the security group +// This is helpful for debugging when rules don't appear to be created +func (h *awsHelper) verifySecurityGroupRules(sgID string, expectedPorts []v1.ServicePort) error { + if h.ec2Client == nil { + return fmt.Errorf("ec2Client is not initialized") + } + + sg, err := h.getSecurityGroup(sgID) + if err != nil { + return fmt.Errorf("failed to get security group %s: %v", sgID, err) + } + + framework.Logf("Security group %s has %d ingress rules:", sgID, len(sg.IpPermissions)) + for i, rule := range sg.IpPermissions { + protocol := aws.ToString(rule.IpProtocol) + fromPort := aws.ToInt32(rule.FromPort) + toPort := aws.ToInt32(rule.ToPort) + + var sources []string + for _, ipRange := range rule.IpRanges { + sources = append(sources, aws.ToString(ipRange.CidrIp)) + } + for _, sg := range rule.UserIdGroupPairs { + sources = append(sources, fmt.Sprintf("sg-%s", aws.ToString(sg.GroupId))) + } + + framework.Logf(" Rule %d: protocol=%s, ports=%d-%d, sources=%v", i+1, protocol, fromPort, toPort, sources) + } + + // Check if expected ports are covered + for _, expectedPort := range expectedPorts { + expectedProtocol := strings.ToLower(string(expectedPort.Protocol)) + expectedPortNum := int32(expectedPort.Port) + + found := false + for _, rule := range sg.IpPermissions { + ruleProtocol := aws.ToString(rule.IpProtocol) + fromPort := aws.ToInt32(rule.FromPort) + toPort := aws.ToInt32(rule.ToPort) + + if ruleProtocol == expectedProtocol && fromPort <= expectedPortNum && expectedPortNum <= toPort { + found = true + break + } + } + + if !found { + framework.Logf("WARNING: Expected rule for protocol=%s port=%d not found in security group %s", expectedProtocol, expectedPortNum, sgID) + } else { + framework.Logf("✓ Found rule for protocol=%s port=%d in security group %s", expectedProtocol, expectedPortNum, sgID) + } + } + + return nil +} + +// waitForSecurityGroupDeletion attempts to delete a security group and waits for it to be deleted +// It handles dependency violations when the SG is still attached to resources like load balancers +func (h *awsHelper) waitForSecurityGroupDeletion(sgID string, timeout time.Duration) error { + return wait.PollImmediate(1*time.Second, timeout, func() (bool, error) { + _, err := h.getSecurityGroup(sgID) + if err != nil { + framework.Logf("Security group %s appears to be deleted: %v", sgID, err) + return true, nil + } + + err = h.deleteSecurityGroup(sgID) + if err != nil { + // Check for dependency violation errors + if strings.Contains(err.Error(), "DependencyViolation") || + strings.Contains(err.Error(), "InvalidGroup.InUse") || + strings.Contains(err.Error(), "resource has a dependent object") { + framework.Logf("Security group %s still has dependencies, waiting... (%v)", sgID, err) + return false, nil // Keep waiting + } + + // Check if it's already deleted + if strings.Contains(err.Error(), "InvalidGroup.NotFound") || + strings.Contains(err.Error(), "InvalidGroupId.NotFound") { + framework.Logf("Security group %s is already deleted", sgID) + return true, nil + } + + // For other errors, return the error + return false, err + } + + framework.Logf("Successfully deleted security group %s", sgID) + return true, nil + }) +} + +// deleteSecurityGroup deletes a security group +func (h *awsHelper) deleteSecurityGroup(sgID string) error { + if _, err := h.ec2Client.DeleteSecurityGroup(h.ctx, &ec2.DeleteSecurityGroupInput{ + GroupId: aws.String(sgID), + }); err != nil { + return fmt.Errorf("failed to delete security group %s: %v", sgID, err) + } + + return nil +} diff --git a/tests/e2e/go.mod b/tests/e2e/go.mod index ed508ca523..28441086f4 100644 --- a/tests/e2e/go.mod +++ b/tests/e2e/go.mod @@ -3,8 +3,10 @@ module k8s.io/cloud-provider-aws/tests/e2e go 1.24.7 require ( - github.com/aws/aws-sdk-go-v2 v1.36.3 + github.com/aws/aws-sdk-go-v2 v1.37.1 github.com/aws/aws-sdk-go-v2/config v1.29.14 + github.com/aws/aws-sdk-go-v2/service/ec2 v1.239.0 + github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.30.1 github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.45.2 github.com/onsi/ginkgo/v2 v2.21.0 github.com/onsi/gomega v1.35.1 @@ -20,15 +22,15 @@ require ( github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.17.67 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.1 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.1 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.1 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 // indirect - github.com/aws/smithy-go v1.22.2 // indirect + github.com/aws/smithy-go v1.22.5 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect diff --git a/tests/e2e/go.sum b/tests/e2e/go.sum index 4bf609fe98..bc9cde48cf 100644 --- a/tests/e2e/go.sum +++ b/tests/e2e/go.sum @@ -4,34 +4,38 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= -github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM= -github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= +github.com/aws/aws-sdk-go-v2 v1.37.1 h1:SMUxeNz3Z6nqGsXv0JuJXc8w5YMtrQMuIBmDx//bBDY= +github.com/aws/aws-sdk-go-v2 v1.37.1/go.mod h1:9Q0OoGQoboYIAJyslFyF1f5K1Ryddop8gqMhWx/n4Wg= github.com/aws/aws-sdk-go-v2/config v1.29.14 h1:f+eEi/2cKCg9pqKBoAIwRGzVb70MRKqWX4dg1BDcSJM= github.com/aws/aws-sdk-go-v2/config v1.29.14/go.mod h1:wVPHWcIFv3WO89w0rE10gzf17ZYy+UVS1Geq8Iei34g= github.com/aws/aws-sdk-go-v2/credentials v1.17.67 h1:9KxtdcIA/5xPNQyZRgUSpYOE6j9Bc4+D7nZua0KGYOM= github.com/aws/aws-sdk-go-v2/credentials v1.17.67/go.mod h1:p3C44m+cfnbv763s52gCqrjaqyPikj9Sg47kUVaNZQQ= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.1 h1:ksZXBYv80EFTcgc8OJO48aQ8XDWXIQL7gGasPeCoTzI= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.1/go.mod h1:HSksQyyJETVZS7uM54cir0IgxttTD+8aEoJMPGepHBI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.1 h1:+dn/xF/05utS7tUhjIcndbuaPjfll2LhbH1cCDGLYUQ= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.1/go.mod h1:hyAGz30LHdm5KBZDI58MXx5lDVZ5CUfvfTZvMu4HCZo= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.239.0 h1:pPuzRQQoRY7pwxlNf1//yz5goxB98p1KMa3cdBO+E1E= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.239.0/go.mod h1:lhyI/MJGGbPnOdYmmQRZe07S+2fW2uWI1XrUfAZgXLM= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.30.1 h1:ZZeI9bCwIbqoKu5hll1dmisMJ4ZeBEhdsszV/gOFm1s= +github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.30.1/go.mod h1:fcC73gpU/J/SmRut8/CmwM5oJO3bSpW7wgufVdtWSbg= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.45.2 h1:vX70Z4lNSr7XsioU0uJq5yvxgI50sB66MvD+V/3buS4= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.45.2/go.mod h1:xnCC3vFBfOKpU6PcsCKL2ktgBTZfOwTGxj6V8/X3IS4= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0 h1:6+lZi2JeGKtCraAj1rpoZfKqnQ9SptseRZioejfUOLM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.0/go.mod h1:eb3gfbVIxIoGgJsi9pGne19dhCBpK6opTYpQqAmdy44= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.1 h1:ky79ysLMxhwk5rxJtS+ILd3Mc8kC5fhsLBrP27r6h4I= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.1/go.mod h1:+2MmkvFvPYM1vsozBWduoLJUi5maxFk5B7KJFECujhY= github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 h1:1Gw+9ajCV1jogloEv1RRnvfRFia2cL6c9cuKV2Ps+G8= github.com/aws/aws-sdk-go-v2/service/sso v1.25.3/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 h1:hXmVKytPfTy5axZ+fYbR5d0cFmC3JvwLm5kM83luako= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs= github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 h1:1XuUZ8mYJw9B6lzAkXhqHlJd/XvaX32evhproijJEZY= github.com/aws/aws-sdk-go-v2/service/sts v1.33.19/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4= -github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ= -github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= +github.com/aws/smithy-go v1.22.5 h1:P9ATCXPMb2mPjYBgueqJNCA5S9UfktsW0tTxi+a7eqw= +github.com/aws/smithy-go v1.22.5/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/tests/e2e/loadbalancer.go b/tests/e2e/loadbalancer.go index f008a0bb97..9067e42e3a 100644 --- a/tests/e2e/loadbalancer.go +++ b/tests/e2e/loadbalancer.go @@ -37,6 +37,7 @@ import ( "github.com/aws/aws-sdk-go-v2/config" elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" + "k8s.io/apimachinery/pkg/api/errors" ) const ( @@ -44,6 +45,7 @@ const ( annotationLBInternal = "service.beta.kubernetes.io/aws-load-balancer-internal" annotationLBTargetNodeLabels = "service.beta.kubernetes.io/aws-load-balancer-target-node-labels" annotationLBTargetGroupAttributes = "service.beta.kubernetes.io/aws-load-balancer-target-group-attributes" + annotationLBSecurityGroups = "service.beta.kubernetes.io/aws-load-balancer-security-groups" ) var ( @@ -246,6 +248,92 @@ var _ = Describe("[cloud-provider-aws-e2e] loadbalancer", func() { } }, }, + // BYO Security Group tests. + // The "CLB with managed security group mut update to BYO..." must validate the features: + // - existing Service CLB with managed SG have correct tags + // - existing Service CLB with managed SG is updated to BYO SG (user-provided) through annotation + // - controller removes the managed SG when BYO SG is applied + // - load balancer is reachable after the update + { + name: "CLB with managed Security Group must update to BYO Security Group", + resourceSuffix: "clb-sg", + listenerCount: 1, + hookPreTest: func(cfg *e2eTestConfig) { + framework.Logf("running hook post-service-config patching service annotation with BYO security group") + isNLB := false + lbDNS := cfg.svc.Status.LoadBalancer.Ingress[0].Hostname + + framework.Logf("Getting load balancer security groups with address %s", lbDNS) + managedSecurityGroups, err := cfg.awsHelper.getLoadBalancerSecurityGroups(isNLB, lbDNS) + framework.ExpectNoError(err, "Failed to get load balancer security groups") + framework.Logf("Load balancer %s has security groups: %+v", lbDNS, managedSecurityGroups) + + framework.Logf("Checking if managed SGs are owned by the controller") + for _, sgID := range managedSecurityGroups { + managed, err := cfg.awsHelper.isSecurityGroupManaged(sgID) + framework.ExpectNoError(err, fmt.Sprintf("Failed to check if security group %q is managed", sgID)) + if !managed { + framework.Failf("Security group %q is not managed by the controller", sgID) + } + framework.Logf("Security group %q is managed: %t", sgID, managed) + } + + securityGroupName := cfg.svc.Namespace + "-" + cfg.svc.Name + "-sg-byo" + framework.Logf("Creating BYO SG with name %q", securityGroupName) + cfg.byoSecurityGroupID, err = cfg.awsHelper.createSecurityGroup(securityGroupName, fmt.Sprintf("BYO Security Group for e2e test service %s/%s", cfg.svc.Namespace, cfg.svc.Name)) + framework.ExpectNoError(err, "Failed to create BYO security group") + framework.Logf("BYO SG %q created with ID %q", securityGroupName, cfg.byoSecurityGroupID) + + // Currently controller does not update rules for BYO SG. + // TODO: Verify if controller needs to update rules for BYO SG. + framework.Logf("Authorizing BYO SG %q to service ports: %+v", cfg.byoSecurityGroupID, cfg.svc.Spec.Ports) + framework.ExpectNoError(cfg.awsHelper.authorizeSecurityGroupToPorts(cfg.byoSecurityGroupID, cfg.svc.Spec.Ports), "Failed to authorize BYO security group to service ports") + framework.Logf("BYO SG %q authorized to service ports", cfg.byoSecurityGroupID) + + // Verify the rules were actually created + framework.Logf("Verifying security group rules were created for BYO SG %q", cfg.byoSecurityGroupID) + framework.ExpectNoError(cfg.awsHelper.verifySecurityGroupRules(cfg.byoSecurityGroupID, cfg.svc.Spec.Ports), "Failed to verify BYO security group rules") + + framework.Logf("Patching Service %q with BYO SG %q", cfg.svc.Name, cfg.byoSecurityGroupID) + cfg.svc.Annotations[annotationLBSecurityGroups] = cfg.byoSecurityGroupID + newSvc, err := cfg.kubeClient.CoreV1().Services(cfg.LBJig.Namespace).Update(cfg.ctx, cfg.svc, metav1.UpdateOptions{}) + framework.ExpectNoError(err, "Failed to update Kubernetes Service") + cfg.svc = newSvc + + framework.Logf("Waiting for load balancer to be updated") + time.Sleep(10 * time.Second) + + framework.Logf("Getting load balancer security groups after update") + byoSecurityGroups, err := cfg.awsHelper.getLoadBalancerSecurityGroups(isNLB, lbDNS) + framework.ExpectNoError(err, "Failed to get load balancer security groups") + + framework.Logf("Load balancer %s has security groups: %+v", lbDNS, byoSecurityGroups) + framework.Logf("Checking if LB is using BYO SG %q", cfg.byoSecurityGroupID) + for _, sgID := range byoSecurityGroups { + if sgID == cfg.byoSecurityGroupID { + framework.Logf("Load balancer %s has BYO security group %q", lbDNS, sgID) + break + } + framework.Failf("Load balancer %s has different security group than expected. Want=%q got=%q", lbDNS, cfg.byoSecurityGroupID, sgID) + } + + framework.Logf("Checking if managed SGs were removed") + for _, sgID := range managedSecurityGroups { + framework.Logf("Checking if managed SG %q is removed by controller", sgID) + + sg, err := cfg.awsHelper.getSecurityGroup(sgID) + if err != nil && strings.Contains(err.Error(), "InvalidGroup.NotFound") { + framework.Logf("Managed security group %q removed", sgID) + break + } + if sg != nil { + framework.Failf("expected managed security group %q removed by controller, got %q", sgID, aws.ToString(sg.GroupId)) + } + framework.Failf("managed security group %q was not removed by controller: %v", sgID, err) + } + framework.Logf("pre-test hook completed") + }, + }, } serviceNameBase := "lbconfig-test" @@ -254,6 +342,8 @@ var _ = Describe("[cloud-provider-aws-e2e] loadbalancer", func() { By("setting up test environment and discovering worker nodes") e2e := newE2eTestConfig(cs) e2e.discoverClusterWorkerNode() + defer e2e.cleanup() + framework.Logf("[SETUP] Test case: %s", tc.name) framework.Logf("[SETUP] Worker nodes discovered: %d nodes, selector: %s, sample node: %s", e2e.nodeCount, e2e.nodeSelector, e2e.nodeSingleSample) @@ -376,11 +466,11 @@ var _ = Describe("[cloud-provider-aws-e2e] loadbalancer", func() { if tc.overrideTestRunInClusterReachableHTTP { By("testing HTTP connectivity for internal load balancer") framework.Logf("[TEST] Running internal connectivity test from node: %s", e2e.nodeSingleSample) - err := inClusterTestReachableHTTP(cs, ns.Name, e2e.nodeSingleSample, ingressAddress, svcPort) + err := e2e.inClusterTestReachableHTTP(ingressAddress, svcPort) if err != nil && tc.skipTestFailure { Skip(err.Error()) } - framework.ExpectNoError(err) + framework.ExpectNoError(err, "Failed to test HTTP connectivity from internal network") } else { By("testing HTTP connectivity for external/internet-facing load balancer") framework.Logf("[TEST] Running external connectivity test to %s:%d", ingressAddress, svcPort) @@ -393,13 +483,13 @@ var _ = Describe("[cloud-provider-aws-e2e] loadbalancer", func() { _, err = e2e.LBJig.UpdateService(ctx, func(s *v1.Service) { s.Spec.Type = v1.ServiceTypeClusterIP }) - framework.ExpectNoError(err) + framework.ExpectNoError(err, "Failed to update service to ClusterIP") // Wait for the load balancer to be destroyed asynchronously By("cleaning up: waiting for load balancer destruction") framework.Logf("[CLEANUP] Waiting for load balancer destruction") _, err = e2e.LBJig.WaitForLoadBalancerDestroy(ctx, ingressAddress, svcPort, loadBalancerCreateTimeout) - framework.ExpectNoError(err) + framework.ExpectNoError(err, "Failed to wait for load balancer destruction") framework.Logf("[CLEANUP] Load balancer destroyed successfully") }) } @@ -409,6 +499,11 @@ type e2eTestConfig struct { ctx context.Context kubeClient clientset.Interface + // AWS helper + awsHelper *awsHelper + + byoSecurityGroupID string + // service configuration cfgPortCount int cfgPodPort uint16 @@ -431,6 +526,9 @@ func newE2eTestConfig(cs clientset.Interface) *e2eTestConfig { ctx, cancel := context.WithTimeout(context.Background(), 25*time.Minute) _ = cancel // We'll let the test framework handle cleanup + h, err := newAWSHelper(ctx, cs) + framework.ExpectNoError(err, "Failed to create AWS helper") + return &e2eTestConfig{ kubeClient: cs, cfgPortCount: 2, @@ -441,6 +539,7 @@ func newE2eTestConfig(cs clientset.Interface) *e2eTestConfig { "aws-load-balancer-backend-protocol": "http", "aws-load-balancer-ssl-ports": "https", }, + awsHelper: h, } } @@ -689,14 +788,12 @@ func getAWSLoadBalancerFromDNSName(ctx context.Context, elbClient *elbv2.Client, } // inClusterTestReachableHTTP creates a pod within the cluster to test HTTP connectivity to a target IP and port. -// It schedules the pod on the specified node using node affinity to test the hairpin scenario. -// The pod uses a curl-based container to perform the HTTP request and validates the response. -// The function waits for the pod to complete its execution and inspects its exit code to determine success or failure. +// It schedules a client pod on the specified node using node affinity to test the hairpin scenario. +// The client pod uses a curl-based container to perform the HTTP request to the target server (behind the load balancer) +// and validates the response. +// The function waits for the client pod to complete its execution and inspects its exit code to determine success or failure. // // Parameters: -// - cs: Kubernetes clientset interface used to interact with the cluster. -// - namespace: The namespace in which the test pod will be created. -// - nodeName: The name of the node where the test pod should be scheduled. // - target: The IP address or Hostname of the target HTTP server. // - targetPort: The port number of the target HTTP server. // @@ -704,22 +801,40 @@ func getAWSLoadBalancerFromDNSName(ctx context.Context, elbClient *elbv2.Client, // - error: Returns an error if the pod creation, execution, or cleanup fails, or if the HTTP test fails unexpectedly. // // Behavior: -// - The function creates a pod with a curl-based container to perform the HTTP request. -// - It configures the pod to run as a non-root user with security settings. -// - The pod is scheduled on the specified node using node affinity. -// - Logs are periodically collected during the pod's execution for troubleshooting. -// - Events are inspected if the pod remains in a pending state for too long. -// - The function waits for the pod to complete and inspects its exit code to determine success or failure. -// - If the pod fails, an error is returned. -// - The pod is cleaned up after the test completes. -func inClusterTestReachableHTTP(cs clientset.Interface, namespace, nodeName, target string, targetPort int) error { +// - The function creates a client pod with a curl-based container to perform the HTTP request. +// - The client pod is scheduled on the specified node using node affinity. +// - Logs are periodically collected during the client pod's execution for troubleshooting. +// - Events are inspected if the client pod remains in a pending state for too long. +// - The function waits for the client pod to complete and inspects its exit code to determine success or failure. +// +// Acknowledgement: +// Documentation generated by Cursor AI, reviewed by Human. +// Function generated by Human, reviewed and verbosity increased by Cursor AI. +func (e2e *e2eTestConfig) inClusterTestReachableHTTP(target string, targetPort int) error { podName := "http-test-pod" + // Enhanced curl configuration for better resilience + // Total timeout calculation: 15 retries * 30s delay + 10min curl max time = ~17.5 minutes + // This aligns with the 15-minute polling timeout below + curlArgs := []string{ + "--retry", "20", // Increase retries for new LBs + "--retry-delay", "30", // Longer delay for DNS propagation + "--retry-max-time", "600", // 10 minutes max for curl operations + "--retry-all-errors", // Retry on all errors including DNS + "--retry-connrefused", // Explicitly retry connection refused + "--connect-timeout", "30", // 30s connection timeout + "--max-time", "45", // 45s per individual request + "--trace-time", // Include timestamps for debugging + "--verbose", // More detailed output for troubleshooting + "-w", "\"\\nCURL_SUMMARY: HTTPCode=%{http_code} Time=%{time_total}s ConnectTime=%{time_connect}s DNSTime=%{time_namelookup}s\\n\"", + fmt.Sprintf("http://%s:%d/echo?msg=hello", target, targetPort), + } + // client http test (curl) pod spec. pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: podName, - Namespace: namespace, + Namespace: e2e.svc.Namespace, }, Spec: v1.PodSpec{ Containers: []v1.Container{ @@ -727,21 +842,20 @@ func inClusterTestReachableHTTP(cs clientset.Interface, namespace, nodeName, tar Name: "curl", Image: imageutils.GetE2EImage(imageutils.Agnhost), Command: []string{"curl"}, - Args: []string{ - "--retry", "15", // Retry up to 15 times in case of transient network issues. - "--retry-delay", "20", // Wait 20 seconds between retries. - "--retry-max-time", "480", // Maximum time for retries is 480 seconds. - "--retry-all-errors", // Retry on all errors, ensuring robustness against temporary failures. - "--trace-time", // Include timestamps in trace output for debugging. - "-w", "\\\"\\n---> HTTPCode=%{http_code} Time=%{time_total}ms <---\\n\\\"", // Format output to include HTTP code and response time. - fmt.Sprintf("http://%s:%d/echo?msg=hello", target, targetPort), + Args: curlArgs, + SecurityContext: &v1.SecurityContext{ + AllowPrivilegeEscalation: aws.Bool(false), + Capabilities: &v1.Capabilities{ + Drop: []v1.Capability{"ALL"}, + }, + ReadOnlyRootFilesystem: aws.Bool(true), }, }, }, SecurityContext: &v1.PodSecurityContext{ - RunAsNonRoot: aws.Bool(true), // Ensures the pod runs as a non-root user for enhanced security. - RunAsUser: aws.Int64(1000), // Specifies the user ID for the container process. - RunAsGroup: aws.Int64(1000), // Specifies the group ID for the container process. + RunAsNonRoot: aws.Bool(true), + RunAsUser: aws.Int64(1000), + RunAsGroup: aws.Int64(1000), SeccompProfile: &v1.SeccompProfile{ Type: v1.SeccompProfileTypeRuntimeDefault, // Enforces runtime default seccomp profile for syscall filtering. }, @@ -756,7 +870,7 @@ func inClusterTestReachableHTTP(cs clientset.Interface, namespace, nodeName, tar { Key: "kubernetes.io/hostname", Operator: v1.NodeSelectorOpIn, - Values: []string{nodeName}, // Ensures the pod is scheduled on the specified node. + Values: []string{e2e.nodeSingleSample}, }, }, }, @@ -770,106 +884,241 @@ func inClusterTestReachableHTTP(cs clientset.Interface, namespace, nodeName, tar framework.Logf("In-Cluster test PodSpec Image=%v Command=%v Args=%v", ct.Image, ct.Command, ct.Args) // Create the pod - _, err := cs.CoreV1().Pods(namespace).Create(context.TODO(), pod, metav1.CreateOptions{}) + _, err := e2e.kubeClient.CoreV1().Pods(e2e.svc.Namespace).Create(e2e.ctx, pod, metav1.CreateOptions{}) if err != nil { return fmt.Errorf("failed to create HTTP test pod: %v", err) } // Clean up the pod defer func() { - err = cs.CoreV1().Pods(namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) + err = e2e.kubeClient.CoreV1().Pods(e2e.svc.Namespace).Delete(e2e.ctx, podName, metav1.DeleteOptions{}) if err != nil { framework.Logf("Failed to delete pod %s: %v", podName, err) } }() - // Pod logs wrapper. Collect recent logs, or all, from a test pod. - gatherLogs := func(tail int) string { - opts := &v1.PodLogOptions{} - if tail == 0 { - tail = 20 - } - opts.TailLines = aws.Int64(int64(tail)) - logs, errL := cs.CoreV1().Pods(namespace).GetLogs(podName, opts).DoRaw(context.TODO()) - if errL != nil { - framework.Logf("Failed to retrieve pod logs: %v", errL) - return "" - } - return string(logs) - } - - // Wait for the test pod to complete. Limit waiter be higher than curl retries. + // Wait for the test pod to complete. Align timeout with curl retry configuration + // Curl timeout: 20 retries * 30s delay + 600s max = ~1200s (~20 minutes) + // Pod polling timeout: 20 minutes + buffer = 22 minutes waitCount := 0 pendingCount := 0 - err = wait.PollImmediate(15*time.Second, 15*time.Minute, func() (bool, error) { - p, err := cs.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + consecutiveErrorCount := 0 + maxConsecutiveErrors := 3 + lastLoggedPhase := "" + podPollingTimeout := 22 * time.Minute + + framework.Logf("=== STARTING POD MONITORING ===") + framework.Logf("Pod polling timeout: %v (aligned with curl timeout)", podPollingTimeout) + + err = wait.PollImmediate(15*time.Second, podPollingTimeout, func() (bool, error) { + p, err := e2e.kubeClient.CoreV1().Pods(e2e.svc.Namespace).Get(e2e.ctx, podName, metav1.GetOptions{}) if err != nil { - framework.Logf("Error getting pod %s: %v", podName, err) + consecutiveErrorCount++ + framework.Logf("Error getting pod %s (attempt %d/%d): %v", podName, consecutiveErrorCount, maxConsecutiveErrors, err) + + // Debugging information for CI troubleshooting + if consecutiveErrorCount == 1 { + framework.Logf("=== CI Environment Debug Info ===") + framework.Logf("Namespace: %s, PodName: %s, NodeName: %s", e2e.svc.Namespace, podName, e2e.nodeSingleSample) + framework.Logf("Error type: %T", err) + framework.Logf("Error details: %v", err) + framework.Logf("API server connectivity issue detected in CI environment") + } + + // Check if this is a retriable error (API server issues, network problems, etc.) + if isRetriableKubernetesError(err) && consecutiveErrorCount < maxConsecutiveErrors { + framework.Logf("Treating as transient API server error, will retry in 15 seconds...") + return false, nil // Continue polling, don't fail immediately + } + + // If we've had too many consecutive errors or this is a non-retriable error, fail + framework.Logf("Permanent error or too many consecutive errors (%d), failing test", consecutiveErrorCount) return false, err } - framework.Logf("Pod %s status: Phase=%s", podName, p.Status.Phase) + + consecutiveErrorCount = 0 + + // Log phase changes + if string(p.Status.Phase) != lastLoggedPhase { + framework.Logf("Pod %s phase changed: %s -> %s", podName, lastLoggedPhase, p.Status.Phase) + lastLoggedPhase = string(p.Status.Phase) + } + podFinished := p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed + if p.Status.Phase == v1.PodFailed { + framework.Logf("Pod entered Failed state - performing detailed analysis:") + framework.Logf("%s", analyzePodFailure(p)) + framework.Logf("Recent logs from failed pod:\n%s", gatherPodLogs(e2e, podName, 50)) + } + // Troubleshoot pending pods if p.Status.Phase == v1.PodPending { pendingCount++ } if pendingCount%10 == 0 && pendingCount > 0 { framework.Logf("Pod %s is pending for too long, checking events...", podName) - events, errE := cs.CoreV1().Events(namespace).List(context.TODO(), metav1.ListOptions{ + + // Collect pod-specific events + events, errE := e2e.kubeClient.CoreV1().Events(e2e.svc.Namespace).List(e2e.ctx, metav1.ListOptions{ FieldSelector: fmt.Sprintf("involvedObject.name=%s", podName), }) if errE != nil { framework.Logf("Failed to list events for pod %s: %v", podName, errE) } else { + framework.Logf("Pod-specific events:") for _, event := range events.Items { - framework.Logf("Event: %s - %s", event.Reason, event.Message) + framework.Logf(" [%s] %s: %s (Count: %d)", + event.Type, event.Reason, event.Message, event.Count) } } + + // Collect node-level events if pod is scheduled + if p.Spec.NodeName != "" { + nodeEvents, errNE := e2e.kubeClient.CoreV1().Events(e2e.svc.Namespace).List(e2e.ctx, metav1.ListOptions{ + FieldSelector: fmt.Sprintf("involvedObject.name=%s", p.Spec.NodeName), + }) + if errNE != nil { + framework.Logf("Failed to list events for node %s: %v", p.Spec.NodeName, errNE) + } else if len(nodeEvents.Items) > 0 { + framework.Logf("Node %s recent events:", p.Spec.NodeName) + for _, event := range nodeEvents.Items { + framework.Logf(" [%s] %s: %s", event.Type, event.Reason, event.Message) + } + } + } + + framework.Logf("Preliminary analysis for pending pod:") + framework.Logf("%s", analyzePodFailure(p)) } // frequently collect logs. if waitCount > 0 && waitCount%4 == 0 { - framework.Logf("Tail logs for HTTP test pod:\n%s", gatherLogs(5)) + framework.Logf("Tail logs for HTTP test pod:\n%s", gatherPodLogs(e2e, podName, 5)) } if podFinished { - framework.Logf("Tail logs for HTTP test pod:\n%s", gatherLogs(0)) + framework.Logf("Tail logs for HTTP test pod:\n%s", gatherPodLogs(e2e, podName, 0)) } waitCount++ return podFinished, nil }) - // Check overall error if err != nil { return fmt.Errorf("error waiting for pod %s to complete: %v", podName, err) } // Inspect the pod's container status for exit code - pod, errS := cs.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{}) + pod, errS := e2e.kubeClient.CoreV1().Pods(e2e.svc.Namespace).Get(e2e.ctx, podName, metav1.GetOptions{}) if errS != nil { return fmt.Errorf("failed to get pod %s: %v", podName, errS) } + + framework.Logf("=== FINAL POD STATUS ANALYSIS ===") + framework.Logf("Final pod phase: %s", pod.Status.Phase) + if len(pod.Status.ContainerStatuses) == 0 { - return fmt.Errorf("no container statuses found for pod %s", podName) + framework.Logf("WARNING: No container statuses found - this indicates a scheduling or node issue") + framework.Logf("%s", analyzePodFailure(pod)) + return fmt.Errorf("no container statuses found for pod %s - check pod failure analysis above", podName) } + containerStatus := pod.Status.ContainerStatuses[0] + framework.Logf("Container state analysis:") + framework.Logf(" Ready: %t", containerStatus.Ready) + framework.Logf(" Restart count: %d", containerStatus.RestartCount) + // Detailed termination analysis if containerStatus.State.Terminated != nil { - exitCode := containerStatus.State.Terminated.ExitCode + termination := containerStatus.State.Terminated + exitCode := termination.ExitCode + framework.Logf(" Termination reason: %s", termination.Reason) + framework.Logf(" Exit code: %d", exitCode) + framework.Logf(" Termination message: %s", termination.Message) + if exitCode != 0 { - errmsg := fmt.Errorf("pod %s exited with code %d", podName, exitCode) - framework.Logf("WARNING: %s.", errmsg.Error()) + // Gather comprehensive failure information + framework.Logf("=== CURL TEST FAILURE ANALYSIS ===") + framework.Logf("Exit code %d indicates curl command failed", exitCode) + framework.Logf("Common exit codes:") + framework.Logf(" 6: Couldn't resolve host") + framework.Logf(" 7: Failed to connect to host") + framework.Logf(" 28: Operation timeout") + framework.Logf(" 52: Empty reply from server") + framework.Logf(" 56: Failure in receiving network data") + + finalLogs := gatherPodLogs(e2e, podName, 0) + framework.Logf("Final container logs:\n%s", finalLogs) + + // Provide specific guidance based on exit code + var guidance string + switch exitCode { + case 6: + guidance = "DNS resolution failure - check if target hostname is resolvable" + case 7: + guidance = "Connection refused - check if target service is accessible and load balancer is working" + case 28: + guidance = "Timeout - check if target service is responding or increase curl timeout" + case 52: + guidance = "Empty reply - target service might be misconfigured or not running" + case 56: + guidance = "Network data receive failure - possible network connectivity issues" + default: + guidance = "Check curl logs above for specific error details" + } + + errmsg := fmt.Errorf("HTTP connectivity test failed: pod %s exited with code %d. Guidance: %s", podName, exitCode, guidance) + framework.Logf("CONNECTIVITY TEST RESULT: FAILED - %s", errmsg.Error()) return errmsg } + } else if containerStatus.State.Waiting != nil { + framework.Logf("Container still waiting: %s - %s", + containerStatus.State.Waiting.Reason, containerStatus.State.Waiting.Message) + framework.Logf("%s", analyzePodFailure(pod)) + return fmt.Errorf("pod %s container never started properly - check failure analysis above", podName) + } else if containerStatus.State.Running != nil { + framework.Logf("WARNING: Container still running - this shouldn't happen with RestartPolicy=Never") + return fmt.Errorf("pod %s container still running after timeout - unexpected state", podName) + } + + // Validate HTTP response format with enhanced checking + // Expected format: CURL_SUMMARY: HTTPCode=200 Time=