Skip to content

Commit 2371120

Browse files
Merge pull request #1267 from rikatz/rkatz/deflake-customendpoints
OCPBUGS-53432: deflake TestIngressControllerCustomEndpoints by proper waiting for CCM to be ready
2 parents b609eb2 + 66f8672 commit 2371120

File tree

2 files changed

+171
-69
lines changed

2 files changed

+171
-69
lines changed

test/e2e/operator_test.go

Lines changed: 83 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ var (
125125
operandNamespace = operatorcontroller.DefaultOperandNamespace
126126
defaultName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.DefaultIngressControllerName}
127127
clusterConfigName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.ClusterIngressConfigName}
128+
ccmDeploymentName = types.NamespacedName{Namespace: "openshift-cloud-controller-manager", Name: "aws-cloud-controller-manager"}
128129

129130
// Platforms that need a DNS "warmup" period for internal (inside the test cluster) DNS resolution.
130131
// The warmup period is a period of delay before the first query is executed to avoid negative caching.
@@ -142,6 +143,13 @@ const (
142143
// cluster (which may be on different platforms) and any negative caching along the way. As of writing this, AWS
143144
// typically resolves within ~1 minute (see OCPBUGS-14966), while IBMCloud takes ~7 minutes (see OCPBUGS-48780).
144145
dnsResolutionTimeout = 10 * time.Minute
146+
147+
// ccmConfigAnnotation is an annotation that exists on CloudControllerManager
148+
// and contains a hash of the current infrastructure configuration.
149+
// It is used to signal to the CCM deployment a configuration change that needs
150+
// a rollout update, and can be used by tests to also detect changes on CCM
151+
// deployment.
152+
ccmConfigAnnotation = "operator.openshift.io/config-hash"
145153
)
146154

147155
func init() {
@@ -2600,71 +2608,81 @@ func TestIngressControllerCustomEndpoints(t *testing.T) {
26002608
if platform == nil {
26012609
t.Fatalf("platform status is missing for infrastructure %s", infraConfig.Name)
26022610
}
2603-
switch platform.Type {
2604-
case configv1.AWSPlatformType:
2605-
switch {
2606-
case platform.AWS == nil:
2607-
t.Fatalf("aws platform status is missing for infrastructure %s", infraConfig.Name)
2608-
case len(platform.AWS.ServiceEndpoints) != 0:
2609-
t.Skipf("custom endpoints detected for infrastructure %s, skipping TestIngressControllerCustomEndpoints",
2610-
infraConfig.Name)
2611-
case len(platform.AWS.Region) == 0:
2612-
t.Fatalf("region is missing from aws platform status for infrastructure %s", infraConfig.Name)
2613-
case platform.AWS.Region == endpoints.CnNorth1RegionID || platform.AWS.Region == endpoints.CnNorthwest1RegionID:
2614-
t.Skipf("region %s or %s detected for infrastructure %s, skipping TestIngressControllerCustomEndpoints",
2615-
endpoints.CnNorth1RegionID, endpoints.CnNorthwest1RegionID, infraConfig.Name)
2616-
}
2617-
route53Endpoint := configv1.AWSServiceEndpoint{
2618-
Name: "route53",
2619-
// AWS Route 53 is a non-regionalized service, so the endpoint URL
2620-
// does not include a region.
2621-
URL: "https://route53.amazonaws.com",
2622-
}
2623-
taggingEndpoint := configv1.AWSServiceEndpoint{
2624-
Name: "tagging",
2625-
// us-east-1 region is required to get hosted zone resources
2626-
// since route 53 is a non-regionalized service.
2627-
URL: "https://tagging.us-east-1.amazonaws.com",
2628-
}
2629-
elbEndpoint := configv1.AWSServiceEndpoint{
2630-
Name: "elasticloadbalancing",
2631-
URL: fmt.Sprintf("https://elasticloadbalancing.%s.amazonaws.com", platform.AWS.Region),
2632-
}
2633-
if err := updateInfrastructureConfigSpecWithRetryOnConflict(t, types.NamespacedName{Name: "cluster"}, timeout, func(spec *configv1.InfrastructureSpec) {
2634-
spec.PlatformSpec.AWS = &configv1.AWSPlatformSpec{
2635-
ServiceEndpoints: []configv1.AWSServiceEndpoint{
2636-
route53Endpoint,
2637-
taggingEndpoint,
2638-
elbEndpoint,
2639-
},
2640-
}
2641-
}); err != nil {
2642-
t.Fatalf("failed to update infrastructure config: %v\n", err)
2611+
2612+
if platform.Type != configv1.AWSPlatformType {
2613+
t.Skipf("skipping TestIngressControllerCustomEndpoints test due to platform type: %s", platform.Type)
2614+
}
2615+
2616+
switch {
2617+
case platform.AWS == nil:
2618+
t.Fatalf("aws platform status is missing for infrastructure %s", infraConfig.Name)
2619+
case len(platform.AWS.ServiceEndpoints) != 0:
2620+
t.Skipf("custom endpoints detected for infrastructure %s, skipping TestIngressControllerCustomEndpoints",
2621+
infraConfig.Name)
2622+
case len(platform.AWS.Region) == 0:
2623+
t.Fatalf("region is missing from aws platform status for infrastructure %s", infraConfig.Name)
2624+
case platform.AWS.Region == endpoints.CnNorth1RegionID || platform.AWS.Region == endpoints.CnNorthwest1RegionID:
2625+
t.Skipf("region %s or %s detected for infrastructure %s, skipping TestIngressControllerCustomEndpoints",
2626+
endpoints.CnNorth1RegionID, endpoints.CnNorthwest1RegionID, infraConfig.Name)
2627+
}
2628+
2629+
route53Endpoint := configv1.AWSServiceEndpoint{
2630+
Name: "route53",
2631+
// AWS Route 53 is a non-regionalized service, so the endpoint URL
2632+
// does not include a region.
2633+
URL: "https://route53.amazonaws.com",
2634+
}
2635+
taggingEndpoint := configv1.AWSServiceEndpoint{
2636+
Name: "tagging",
2637+
// us-east-1 region is required to get hosted zone resources
2638+
// since route 53 is a non-regionalized service.
2639+
URL: "https://tagging.us-east-1.amazonaws.com",
2640+
}
2641+
elbEndpoint := configv1.AWSServiceEndpoint{
2642+
Name: "elasticloadbalancing",
2643+
URL: fmt.Sprintf("https://elasticloadbalancing.%s.amazonaws.com", platform.AWS.Region),
2644+
}
2645+
2646+
// On a non-managed environment, retrieve the current Cloud Controller Manager
2647+
// configuration hash, so later we can verify if the new configuration was applied
2648+
// as part of the infrastructure.config update
2649+
var oldConfigHash string
2650+
if infraConfig.Status.ControlPlaneTopology != configv1.ExternalTopologyMode {
2651+
oldCCMDeployment := waitForCCMAvailableAndUpdated(t, 5*time.Minute, "")
2652+
oldConfigHash = retrieveCCMConfigurationHash(oldCCMDeployment)
2653+
if oldConfigHash == "" {
2654+
t.Fatalf("CCM does not have a config hash, this should never happen: %v", oldCCMDeployment)
2655+
}
2656+
}
2657+
2658+
// Wait for infrastructure status to update with custom endpoints.
2659+
if err := updateAndVerifyInfrastructureConfigWithRetry(t, types.NamespacedName{Name: "cluster"}, timeout, func(spec *configv1.InfrastructureSpec) {
2660+
spec.PlatformSpec.AWS = &configv1.AWSPlatformSpec{
2661+
ServiceEndpoints: []configv1.AWSServiceEndpoint{
2662+
route53Endpoint,
2663+
taggingEndpoint,
2664+
elbEndpoint,
2665+
},
26432666
}
2644-
defer func() {
2645-
// Remove the custom endpoints from the infrastructure config.
2646-
if err := updateInfrastructureConfigSpecWithRetryOnConflict(t, types.NamespacedName{Name: "cluster"}, timeout, func(spec *configv1.InfrastructureSpec) {
2647-
spec.PlatformSpec.AWS = nil
2648-
}); err != nil {
2649-
t.Fatalf("failed to update infrastructure config: %v", err)
2650-
}
2651-
}()
2652-
// Wait for infrastructure status to update with custom endpoints.
2653-
if err := wait.PollImmediate(1*time.Second, 15*time.Second, func() (bool, error) {
2654-
if err := kclient.Get(context.TODO(), types.NamespacedName{Name: "cluster"}, &infraConfig); err != nil {
2655-
t.Logf("failed to get infrastructure config: %v\n", err)
2656-
return false, err
2657-
}
2658-
if len(infraConfig.Status.PlatformStatus.AWS.ServiceEndpoints) == 0 {
2659-
return false, nil
2660-
}
2661-
return true, nil
2667+
}); err != nil {
2668+
t.Fatalf("failed to update infrastructure config: %v", err)
2669+
}
2670+
defer func() {
2671+
// Remove the custom endpoints from the infrastructure config.
2672+
if err := updateAndVerifyInfrastructureConfigWithRetry(t, types.NamespacedName{Name: "cluster"}, timeout, func(spec *configv1.InfrastructureSpec) {
2673+
spec.PlatformSpec.AWS = nil
26622674
}); err != nil {
2663-
t.Fatalf("failed to observe status update for infrastructure config %s", infraConfig.Name)
2675+
t.Fatalf("failed to update infrastructure config: %v", err)
26642676
}
2665-
default:
2666-
t.Skipf("skipping TestIngressControllerCustomEndpoints test due to platform type: %s", platform.Type)
2677+
}()
2678+
2679+
if infraConfig.Status.ControlPlaneTopology != configv1.ExternalTopologyMode {
2680+
// Wait for CCM to be ready with the new configuration, so we can guarantee that
2681+
// the LoadBalancer will be created correctly. This assertion is not possible
2682+
// on managed control planes (eg.: hypershift or rosa)
2683+
_ = waitForCCMAvailableAndUpdated(t, 5*time.Minute, oldConfigHash)
26672684
}
2685+
26682686
// The default ingresscontroller should surface the expected status conditions.
26692687
if err := waitForIngressControllerCondition(t, kclient, 30*time.Second, defaultName, availableConditionsForIngressControllerWithLoadBalancer...); err != nil {
26702688
t.Fatalf("did not get expected ingress controller conditions: %v", err)
@@ -2677,7 +2695,10 @@ func TestIngressControllerCustomEndpoints(t *testing.T) {
26772695
}
26782696
defer assertIngressControllerDeleted(t, kclient, ic)
26792697
// Ensure the ingress controller is reporting expected status conditions.
2680-
if err := waitForIngressControllerCondition(t, kclient, 5*time.Minute, name, availableConditionsForIngressControllerWithLoadBalancer...); err != nil {
2698+
// The readiness of the resource depends on the load of the environment, and because changing InfrastructureConfig
2699+
// causes the Cloud controller manager to rollout, at least 3 minutes are required for leader
2700+
// election of CCM to happen + the required time to properly schedule the Pods once they are killed/restarted
2701+
if err := waitForIngressControllerCondition(t, kclient, 7*time.Minute, name, availableConditionsForIngressControllerWithLoadBalancer...); err != nil {
26812702
t.Errorf("failed to observe expected conditions: %v", err)
26822703
}
26832704
}

test/e2e/util_test.go

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"net"
1313
"net/http"
14+
"slices"
1415
"strings"
1516
"testing"
1617
"time"
@@ -622,31 +623,111 @@ func updateIngressConfigSpecWithRetryOnConflict(t *testing.T, name types.Namespa
622623
})
623624
}
624625

625-
// updateInfrastructureConfigSpecWithRetryOnConflict gets a fresh copy
626+
// updateAndVerifyInfrastructureConfigWithRetry gets a fresh copy
626627
// of the named infrastructure object, calls updateSpecFn() where
627628
// callers can modify fields of the spec, and then updates the infrastructure
628629
// config object. If there is a conflict error on update then the
629630
// complete operation of get, mutate, and update is retried until
630631
// timeout is reached.
631-
func updateInfrastructureConfigSpecWithRetryOnConflict(t *testing.T, name types.NamespacedName, timeout time.Duration, mutateSpecFn func(*configv1.InfrastructureSpec)) error {
632+
// After updating, it will check whether the desired spec.Endpoints were applied to
633+
// the infrastructure config
634+
func updateAndVerifyInfrastructureConfigWithRetry(t *testing.T, name types.NamespacedName, timeout time.Duration, mutateSpecFn func(*configv1.InfrastructureSpec)) error {
632635
infraConfig := configv1.Infrastructure{}
633-
return wait.PollImmediate(1*time.Second, timeout, func() (bool, error) {
634-
if err := kclient.Get(context.TODO(), name, &infraConfig); err != nil {
636+
// First we apply the configuration
637+
err := wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, timeout, false, func(ctx context.Context) (done bool, err error) {
638+
if err := kclient.Get(ctx, name, &infraConfig); err != nil {
635639
t.Logf("error getting infrastructure config %v: %v, retrying...", name, err)
636640
return false, nil
637641
}
638642
mutateSpecFn(&infraConfig.Spec)
639643
if err := kclient.Update(context.TODO(), &infraConfig); err != nil {
640-
if errors.IsConflict(err) {
641-
t.Logf("conflict when updating infrastructure config %v: %v, retrying...", name, err)
644+
t.Logf("error updating infrastructure config %v: %v, retrying...", name, err)
645+
return false, nil
646+
}
647+
return true, nil
648+
})
649+
if err != nil {
650+
return err
651+
}
652+
// Then we get the applied configuration, and check if the serviceendpoints match
653+
return wait.PollUntilContextTimeout(context.TODO(), 1*time.Second, timeout, false, func(ctx context.Context) (done bool, err error) {
654+
if err := kclient.Get(ctx, name, &infraConfig); err != nil {
655+
t.Logf("error getting infrastructure config %v: %v, retrying...", name, err)
656+
return false, nil
657+
}
658+
659+
// Early return if we don't have any specific service endpoint
660+
if infraConfig.Spec.PlatformSpec.AWS == nil || len(infraConfig.Spec.PlatformSpec.AWS.ServiceEndpoints) == 0 {
661+
return true, nil
662+
}
663+
// In case we have a desired ServiceEndpoints, we need to check once it has been reflected correctly
664+
if infraConfig.Status.PlatformStatus == nil ||
665+
infraConfig.Status.PlatformStatus.AWS == nil ||
666+
len(infraConfig.Status.PlatformStatus.AWS.ServiceEndpoints) != len(infraConfig.Spec.PlatformSpec.AWS.ServiceEndpoints) {
667+
t.Logf("infrastructure config status does not contain endpoints yet %v: %v, retrying...", name, infraConfig.Status)
668+
return false, nil
669+
}
670+
671+
for _, endpoint := range infraConfig.Spec.PlatformSpec.AWS.ServiceEndpoints {
672+
if !slices.ContainsFunc(infraConfig.Status.PlatformStatus.AWS.ServiceEndpoints, func(e configv1.AWSServiceEndpoint) bool {
673+
return e.Name == endpoint.Name && e.URL == endpoint.URL
674+
}) {
675+
t.Logf("infrastructure config status does not contain the desired endpoints, retrying %v: %v, retrying...", name, endpoint)
642676
return false, nil
643677
}
644-
return false, err
645678
}
646679
return true, nil
647680
})
648681
}
649682

683+
// waitForCCMAvailableAndUpdated waits for Cloud Controller Manager Deployment
684+
// to be reported as ready.
685+
// If oldConfigurationHash is not empty it will additionally verify that the configuration hash
686+
// is different between the deployments
687+
func waitForCCMAvailableAndUpdated(t *testing.T, timeout time.Duration, oldConfigurationHash string) appsv1.Deployment {
688+
// The poller below will never return an error; instead it will log and retry until success or timeout.
689+
// This prevents transient network errors from causing a test failure, and instead retries until it
690+
// is successful
691+
ccmDeployment := appsv1.Deployment{}
692+
if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, timeout, false, func(ctx context.Context) (done bool, err error) {
693+
if err := kclient.Get(context.TODO(), ccmDeploymentName, &ccmDeployment); err != nil {
694+
t.Logf("error getting CCM deployment: %v", err)
695+
return false, nil
696+
}
697+
698+
if oldConfigurationHash != "" {
699+
currentConfigHash := retrieveCCMConfigurationHash(ccmDeployment)
700+
if oldConfigurationHash == currentConfigHash {
701+
t.Log("new config hash not yet applied to CCM deployment")
702+
return false, nil
703+
}
704+
}
705+
706+
if ccmDeployment.Generation != ccmDeployment.Status.ObservedGeneration ||
707+
ccmDeployment.Status.Replicas != ccmDeployment.Status.ReadyReplicas ||
708+
ccmDeployment.Status.Replicas != ccmDeployment.Status.UpdatedReplicas ||
709+
ccmDeployment.Status.UnavailableReplicas != 0 {
710+
t.Log("CCM replicas are not ready, will retry")
711+
return false, nil
712+
}
713+
714+
return true, nil
715+
716+
}); err != nil {
717+
t.Fatalf("failed to observe status update for infrastructure config %s", infraConfig.Name)
718+
}
719+
720+
return ccmDeployment
721+
}
722+
723+
// retrieveCCMConfigurationHash fetches the config-hash used on CCM deployment
724+
// and present on the annotation operator.openshift.io/config-hash
725+
// this information can be used to signal that a required infrastructure change
726+
// was successfully applied on CCM and forces CCM to do a rollout update.
727+
func retrieveCCMConfigurationHash(ccmDeployment appsv1.Deployment) string {
728+
return ccmDeployment.Spec.Template.Annotations[ccmConfigAnnotation]
729+
}
730+
650731
// updateInfrastructureStatus updates the Infrastructure status by applying
651732
// the given update function to the current Infrastructure object.
652733
// If there is a conflict error on update then the complete operation

0 commit comments

Comments
 (0)