Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pkg/operator/controller/gateway-service-dns/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ func NewUnmanaged(mgr manager.Manager, config Config) (controller.Controller, er
// A DNSRecord CR needs to be updated if, and only if,
// the hostname has changed (a listener's port and
// protocol have no bearing on the DNS record).
return gatewayListenersHostnamesChanged(old, new)
changed := gatewayListenersHostnamesChanged(old, new)
if changed {
log.Info("Listener hostname changed", "gateway", e.ObjectNew.(*gatewayapiv1.Gateway).Name)
}
return changed
},
}
isInOperandNamespace := predicate.NewPredicateFuncs(func(o client.Object) bool {
Expand All @@ -103,6 +107,7 @@ func NewUnmanaged(mgr manager.Manager, config Config) (controller.Controller, er
},
}
requests = append(requests, request)
log.Info("Enqueuing service for gateway", "gateway", o.GetName(), "service", request.NamespacedName)
}
return requests
}
Expand Down
6 changes: 5 additions & 1 deletion pkg/resources/dnsrecord/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,17 @@ func EnsureDNSRecord(client client.Client, name types.NamespacedName, dnsRecordL
if err := client.Create(context.TODO(), desired); err != nil {
return false, nil, fmt.Errorf("failed to create dnsrecord %s/%s: %v", desired.Namespace, desired.Name, err)
}
log.Info("created dnsrecord", "dnsrecord", desired)
log.Info("Created dnsrecord", "dnsrecord", desired)
return CurrentDNSRecord(client, name)
case wantWC && haveWC:
if updated, err := updateDNSRecord(client, current, desired); err != nil {
return true, current, fmt.Errorf("failed to update dnsrecord %s/%s: %v", desired.Namespace, desired.Name, err)
} else if updated {
log.Info("Updated dnsrecord", "dnsrecord", desired)
return CurrentDNSRecord(client, name)
}
}
log.Info("No desired dnsrecord", "service", service)

return haveWC, current, nil
}
Expand Down Expand Up @@ -130,6 +132,7 @@ func desiredWildcardDNSRecord(name types.NamespacedName, dnsRecordLabels map[str
func desiredDNSRecord(name types.NamespacedName, dnsRecordLabels map[string]string, ownerRef metav1.OwnerReference, domain string, dnsPolicy iov1.DNSManagementPolicy, service *corev1.Service) (bool, *iov1.DNSRecord) {
// No LB target exists for the domain record to point at.
if len(service.Status.LoadBalancer.Ingress) == 0 {
log.Info("No load balancer target for dnsrecord", "dnsrecord", name, "service", service.Name)
return false, nil
}

Expand All @@ -138,6 +141,7 @@ func desiredDNSRecord(name types.NamespacedName, dnsRecordLabels map[string]stri
// Quick sanity check since we don't know how to handle both being set (is
// that even a valid state?)
if len(ingress.Hostname) > 0 && len(ingress.IP) > 0 {
log.Error(nil, "Both load balancer hostname and IP are set", "dnsrecord", name, "service", service.Name)
return false, nil
}

Expand Down
63 changes: 49 additions & 14 deletions test/e2e/util_gatewayapi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -594,10 +594,11 @@ func assertGatewayClassSuccessful(t *testing.T, name string) (*gatewayapiv1.Gate
return false, nil
})
if err != nil {
t.Logf("[%s] Last observed gatewayclass:\n%s", time.Now().Format(time.DateTime), util.ToYaml(gwc))
return nil, fmt.Errorf("gatewayclass %s is not %v; last recorded status message: %s", name, gatewayapiv1.GatewayClassConditionStatusAccepted, recordedConditionMsg)
}

t.Logf("Observed that gatewayclass %s has been accepted: %+v", name, gwc.Status)
t.Logf("[%s] Observed that gatewayclass %s has been accepted: %+v", time.Now().Format(time.DateTime), name, gwc.Status)

return gwc, nil
}
Expand All @@ -609,30 +610,58 @@ func assertGatewaySuccessful(t *testing.T, namespace, name string) (*gatewayapiv

gw := &gatewayapiv1.Gateway{}
nsName := types.NamespacedName{Namespace: namespace, Name: name}
recordedConditionMsg := "not found"
recordedAcceptedConditionMsg, recordedProgrammedConditionMsg := "", ""

err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 1*time.Minute, false, func(context context.Context) (bool, error) {
// Wait for the gateway to be accepted and programmed.
// Load balancer provisioning can take several minutes on some platforms.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: it would be great in the future to set timeouts that vary based on platforms.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there an advantage to using a lower timeout on platforms that don't need the higher timeout?

I would prefer to take the longest amount of time we expect provisioning to take, and double it. We don't pay the cost unless provisioning actually takes that amount of time, but we do pay a cost if the test times out when waiting longer would have allowed it to pass.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We pay the cost of higher timeouts when everything is failing. I'm suggesting that we be more choosy with expected timeouts to take advantage of tests failing faster on platforms that don't take as long to provision. It's just a nit though, so no big deal.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm suggesting that we be more choosy with expected timeouts to take advantage of tests failing faster on platforms that don't take as long to provision. It's just a nit though, so no big deal.

My concern with "tests failing faster on platforms that don't take as long to provision" is that it could lead to premature failures. For instance, if LB provisioning on GCP typically takes 1 minute, but occasionally takes 2 minutes, should we fail the test after 1 minute of polling (and then follow up with the CCM team)? Or should we allow it to poll for an additional minute to potentially avoid a retest of the entire suite?

I'd say that the cost of retesting flaky tests outweighs the cost of a slightly longer poll in a failed scenario, especially since the LB provisioning is a process outside of our direct control.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We pay the cost of higher timeouts when everything is failing.

Can we test for "everything is failing" in the polling loop, and exit early if the condition is detected? That would fail faster, flake less, and provide useful diagnostic information right in the test output.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but let's prioritize the merge so we can get better CI signal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but let's prioritize the merge so we can get better CI signal.

Agreed. The PR has already merged, but in general when we have tests that are flaky because polling loops time out, I would argue for (1) increasing timeouts as a first pass in order to reduce flakiness and (2) refining the failure conditions as a second pass in order to fail faster and improve diagnostics. Optimizing to reduce flakiness virtually always takes precedence over optimizing to fail fast.

// Therefore, a timeout of 3 minutes is set to accommodate potential delays.
err := wait.PollUntilContextTimeout(context.Background(), 3*time.Second, 3*time.Minute, false, func(context context.Context) (bool, error) {
if err := kclient.Get(context, nsName, gw); err != nil {
t.Logf("Failed to get gateway %v: %v; retrying...", nsName, err)
return false, nil
}
acceptedConditionFound, programmedConditionFound := false, false
for _, condition := range gw.Status.Conditions {
if condition.Type == string(gatewayapiv1.GatewayConditionAccepted) {
recordedConditionMsg = condition.Message
recordedAcceptedConditionMsg = condition.Message
if condition.Status == metav1.ConditionTrue {
t.Logf("Found gateway %v as Accepted", nsName)
return true, nil
t.Logf("[%s] Found gateway %v as Accepted", time.Now().Format(time.DateTime), nsName)
acceptedConditionFound = true
}
}
// Ensuring the gateway configuration is ready.
// `AddressNotAssigned` may happen if the gateway service
// didn't get its load balancer target.
if condition.Type == string(gatewayapiv1.GatewayConditionProgrammed) {
recordedProgrammedConditionMsg = condition.Message
if condition.Status == metav1.ConditionTrue {
t.Logf("[%s] Found gateway %v as Programmed", time.Now().Format(time.DateTime), nsName)
programmedConditionFound = true
}
}
}
if acceptedConditionFound && programmedConditionFound {
return true, nil
}
t.Logf("[%s] Not all expected gateway conditions are found, checking gateway service...", time.Now().Format(time.DateTime))

// The creation of the gateway service may be delayed.
// Check the current status of the service to see where we are.
svc := &corev1.Service{}
svcNsName := types.NamespacedName{Namespace: namespace, Name: gw.Name + "-" + string(gw.Spec.GatewayClassName)}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer to avoid reusing the code which is being tested in tests. Composing a name like this adds an additional check for an implicit assumption about the naming of the publishing service.

if err := kclient.Get(context, svcNsName, svc); err != nil {
t.Logf("Failed to get gateway service %v: %v; retrying...", svcNsName, err)
return false, nil
}
t.Logf("[%s] Found gateway service: %+v", time.Now().Format(time.DateTime), svc)
return false, nil
})
if err != nil {
t.Logf("Last observed gateway:\n%s", util.ToYaml(gw))
return nil, fmt.Errorf("gateway %v not %v, last recorded status message: %s", nsName, gatewayapiv1.GatewayConditionAccepted, recordedConditionMsg)
t.Logf("[%s] Last observed gateway:\n%s", time.Now().Format(time.DateTime), util.ToYaml(gw))
return nil, fmt.Errorf("gateway %v does not have all expected conditions, last recorded status messages status messages for Accepted: %q, Programmed: %q", nsName, recordedAcceptedConditionMsg, recordedProgrammedConditionMsg)
}

t.Logf("Observed that gateway %v has been accepted: %+v", nsName, gw.Status)
t.Logf("[%s] Observed that gateway %v has been accepted: %+v", time.Now().Format(time.DateTime), nsName, gw.Status)

return gw, nil
}
Expand Down Expand Up @@ -1037,21 +1066,27 @@ func assertDNSRecord(t *testing.T, recordName types.NamespacedName) error {

err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) {
if err := kclient.Get(context, recordName, dnsRecord); err != nil {
t.Logf("Failed to get DNSRecord %v: %v; retrying...", recordName, err)
t.Logf("[%s] Failed to get DNSRecord %v: %v; retrying...", time.Now().Format(time.DateTime), recordName, err)
return false, nil
}
// Determine the current state of the DNSRecord.
reason := "missing published condition"
if len(dnsRecord.Status.Zones) > 0 {
for _, zone := range dnsRecord.Status.Zones {
for _, condition := range zone.Conditions {
if condition.Type == v1.DNSRecordPublishedConditionType && condition.Status == string(metav1.ConditionTrue) {
t.Logf("Found DNSRecord %v %s=%s", recordName, condition.Type, condition.Status)
return true, nil
if condition.Type == v1.DNSRecordPublishedConditionType {
reason = "unexpected published condition value"
if condition.Status == string(metav1.ConditionTrue) {
t.Logf("[%s] Found DNSRecord %v %s=%s", time.Now().Format(time.DateTime), recordName, condition.Type, condition.Status)
return true, nil
}
}
}
}
} else {
reason = "missing zones"
}
t.Logf("Found DNSRecord %v but could not determine its readiness; retrying...", recordName)
t.Logf("[%s] Found DNSRecord %v but could not determine its readiness due to %s; retrying...", time.Now().Format(time.DateTime), recordName, reason)
return false, nil
})
return err
Expand Down