OCPBUGS-54966: Improve detection of missing DNSRecord for Gateway #1212
@@ -594,10 +594,11 @@ func assertGatewayClassSuccessful(t *testing.T, name string) (*gatewayapiv1.Gate
 		return false, nil
 	})
 	if err != nil {
+		t.Logf("[%s] Last observed gatewayclass:\n%s", time.Now().Format(time.DateTime), util.ToYaml(gwc))
 		return nil, fmt.Errorf("gatewayclass %s is not %v; last recorded status message: %s", name, gatewayapiv1.GatewayClassConditionStatusAccepted, recordedConditionMsg)
 	}

-	t.Logf("Observed that gatewayclass %s has been accepted: %+v", name, gwc.Status)
+	t.Logf("[%s] Observed that gatewayclass %s has been accepted: %+v", time.Now().Format(time.DateTime), name, gwc.Status)

 	return gwc, nil
 }
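The recurring change across these hunks is the timestamped log prefix. time.DateTime is the Go 1.20+ layout constant "2006-01-02 15:04:05", so the new prefix renders as a bracketed wall-clock timestamp. A tiny standalone illustration (the message text and the "openshift-default" name are just examples, not taken from the PR):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// time.DateTime == "2006-01-02 15:04:05" (added in Go 1.20),
	// so the log line comes out like: [2025-03-28 14:07:31] Observed that ...
	fmt.Printf("[%s] Observed that gatewayclass %s has been accepted\n",
		time.Now().Format(time.DateTime), "openshift-default")
}
```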
@@ -609,30 +610,58 @@ func assertGatewaySuccessful(t *testing.T, namespace, name string) (*gatewayapiv
 	gw := &gatewayapiv1.Gateway{}
 	nsName := types.NamespacedName{Namespace: namespace, Name: name}
-	recordedConditionMsg := "not found"
+	recordedAcceptedConditionMsg, recordedProgrammedConditionMsg := "", ""

-	err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 1*time.Minute, false, func(context context.Context) (bool, error) {
+	// Wait for the gateway to be accepted and programmed.
+	// Load balancer provisioning can take several minutes on some platforms.
+	// Therefore, a timeout of 3 minutes is set to accommodate potential delays.
+	err := wait.PollUntilContextTimeout(context.Background(), 3*time.Second, 3*time.Minute, false, func(context context.Context) (bool, error) {
 		if err := kclient.Get(context, nsName, gw); err != nil {
 			t.Logf("Failed to get gateway %v: %v; retrying...", nsName, err)
 			return false, nil
 		}
+		acceptedConditionFound, programmedConditionFound := false, false
 		for _, condition := range gw.Status.Conditions {
 			if condition.Type == string(gatewayapiv1.GatewayConditionAccepted) {
-				recordedConditionMsg = condition.Message
+				recordedAcceptedConditionMsg = condition.Message
 				if condition.Status == metav1.ConditionTrue {
-					t.Logf("Found gateway %v as Accepted", nsName)
-					return true, nil
+					t.Logf("[%s] Found gateway %v as Accepted", time.Now().Format(time.DateTime), nsName)
+					acceptedConditionFound = true
 				}
 			}
+			// Ensuring the gateway configuration is ready.
+			// `AddressNotAssigned` may happen if the gateway service
+			// didn't get its load balancer target.
+			if condition.Type == string(gatewayapiv1.GatewayConditionProgrammed) {
+				recordedProgrammedConditionMsg = condition.Message
+				if condition.Status == metav1.ConditionTrue {
+					t.Logf("[%s] Found gateway %v as Programmed", time.Now().Format(time.DateTime), nsName)
+					programmedConditionFound = true
+				}
+			}
 		}
+		if acceptedConditionFound && programmedConditionFound {
+			return true, nil
+		}
+		t.Logf("[%s] Not all expected gateway conditions are found, checking gateway service...", time.Now().Format(time.DateTime))
+
+		// The creation of the gateway service may be delayed.
+		// Check the current status of the service to see where we are.
+		svc := &corev1.Service{}
+		svcNsName := types.NamespacedName{Namespace: namespace, Name: gw.Name + "-" + string(gw.Spec.GatewayClassName)}
Contributor: nit: the generated name could be stored for other uses, like https://github.com/openshift/cluster-ingress-operator/blob/master/pkg/operator/controller/names.go#L319

Author: I prefer to avoid reusing the code which is being tested in tests. Composing a name like this adds an additional check for an implicit assumption about the naming of the publishing service.
+		if err := kclient.Get(context, svcNsName, svc); err != nil {
+			t.Logf("Failed to get gateway service %v: %v; retrying...", svcNsName, err)
+			return false, nil
+		}
+		t.Logf("[%s] Found gateway service: %+v", time.Now().Format(time.DateTime), svc)
 		return false, nil
 	})
 	if err != nil {
-		t.Logf("Last observed gateway:\n%s", util.ToYaml(gw))
-		return nil, fmt.Errorf("gateway %v not %v, last recorded status message: %s", nsName, gatewayapiv1.GatewayConditionAccepted, recordedConditionMsg)
+		t.Logf("[%s] Last observed gateway:\n%s", time.Now().Format(time.DateTime), util.ToYaml(gw))
+		return nil, fmt.Errorf("gateway %v does not have all expected conditions, last recorded status messages for Accepted: %q, Programmed: %q", nsName, recordedAcceptedConditionMsg, recordedProgrammedConditionMsg)
 	}

-	t.Logf("Observed that gateway %v has been accepted: %+v", nsName, gw.Status)
+	t.Logf("[%s] Observed that gateway %v has been accepted: %+v", time.Now().Format(time.DateTime), nsName, gw.Status)

 	return gw, nil
 }
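On the naming nit above: the reviewer's suggestion amounts to centralizing the "<gateway>-<gatewayclass>" convention in a helper next to the operator's other name generators. A minimal sketch of such a helper, purely hypothetical and not the actual function at the linked names.go line:

```go
package controller

import (
	"k8s.io/apimachinery/pkg/types"
	gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1"
)

// GatewayServiceName is a hypothetical helper in the spirit of the name
// generators in pkg/operator/controller/names.go. It assumes the
// "<gateway>-<gatewayclass>" convention that the test re-derives by hand.
func GatewayServiceName(gw *gatewayapiv1.Gateway) types.NamespacedName {
	return types.NamespacedName{
		Namespace: gw.Namespace,
		Name:      gw.Name + "-" + string(gw.Spec.GatewayClassName),
	}
}
```

Whether to call a shared helper or keep the literal composition in the test is exactly the trade-off the author raises: the hand-built name doubles as an independent check that the operator still publishes the service under that convention.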
@@ -1037,21 +1066,27 @@ func assertDNSRecord(t *testing.T, recordName types.NamespacedName) error {
 	err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) {
 		if err := kclient.Get(context, recordName, dnsRecord); err != nil {
-			t.Logf("Failed to get DNSRecord %v: %v; retrying...", recordName, err)
+			t.Logf("[%s] Failed to get DNSRecord %v: %v; retrying...", time.Now().Format(time.DateTime), recordName, err)
 			return false, nil
 		}
+		// Determine the current state of the DNSRecord.
+		reason := "missing published condition"
 		if len(dnsRecord.Status.Zones) > 0 {
 			for _, zone := range dnsRecord.Status.Zones {
 				for _, condition := range zone.Conditions {
-					if condition.Type == v1.DNSRecordPublishedConditionType && condition.Status == string(metav1.ConditionTrue) {
-						t.Logf("Found DNSRecord %v %s=%s", recordName, condition.Type, condition.Status)
-						return true, nil
+					if condition.Type == v1.DNSRecordPublishedConditionType {
+						reason = "unexpected published condition value"
+						if condition.Status == string(metav1.ConditionTrue) {
+							t.Logf("[%s] Found DNSRecord %v %s=%s", time.Now().Format(time.DateTime), recordName, condition.Type, condition.Status)
+							return true, nil
+						}
 					}
 				}
 			}
+		} else {
+			reason = "missing zones"
 		}
-		t.Logf("Found DNSRecord %v but could not determine its readiness; retrying...", recordName)
+		t.Logf("[%s] Found DNSRecord %v but could not determine its readiness due to %s; retrying...", time.Now().Format(time.DateTime), recordName, reason)
 		return false, nil
 	})
 	return err
nit: it would be great in the future to set timeouts that vary based on platforms.

Is there an advantage to using a lower timeout on platforms that don't need the higher timeout? I would prefer to take the longest amount of time we expect provisioning to take, and double it. We don't pay the cost unless provisioning actually takes that amount of time, but we do pay a cost if the test times out when waiting longer would have allowed it to pass.

We pay the cost of higher timeouts when everything is failing. I'm suggesting that we be more choosy with expected timeouts to take advantage of tests failing faster on platforms that don't take as long to provision. It's just a nit though, so no big deal.

My concern with "tests failing faster on platforms that don't take as long to provision" is that it could lead to premature failures. For instance, if LB provisioning on GCP typically takes 1 minute, but occasionally takes 2 minutes, should we fail the test after 1 minute of polling (and then follow up with the CCM team)? Or should we allow it to poll for an additional minute to potentially avoid a retest of the entire suite? I'd say that the cost of retesting flaky tests outweighs the cost of a slightly longer poll in a failed scenario, especially since LB provisioning is a process outside of our direct control.

Can we test for "everything is failing" in the polling loop, and exit early if the condition is detected? That would fail faster, flake less, and provide useful diagnostic information right in the test output.

Yes, but let's prioritize the merge so we can get better CI signal.

Agreed. The PR has already merged, but in general when we have tests that are flaky because polling loops time out, I would argue for (1) increasing timeouts as a first pass in order to reduce flakiness and (2) refining the failure conditions as a second pass in order to fail faster and improve diagnostics. Optimizing to reduce flakiness virtually always takes precedence over optimizing to fail fast.
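To make the "test for everything is failing and exit early" idea concrete: wait.PollUntilContextTimeout already supports it, because returning a non-nil error from the polling callback terminates the poll immediately instead of waiting out the timeout. The sketch below is not from the PR; the permanentlyFailed helper, the waitForGatewayProgrammed wrapper, and the choice of "UnsupportedAddress" as a terminal reason are all illustrative assumptions.

```go
package sketch

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1"
)

// permanentlyFailed reports whether the gateway is in a state that more
// polling cannot fix. Treating Accepted=False with reason "UnsupportedAddress"
// as terminal is an assumption made for this example.
func permanentlyFailed(gw *gatewayapiv1.Gateway) (string, bool) {
	for _, cond := range gw.Status.Conditions {
		if cond.Type == string(gatewayapiv1.GatewayConditionAccepted) &&
			cond.Status == metav1.ConditionFalse &&
			cond.Reason == "UnsupportedAddress" {
			return cond.Message, true
		}
	}
	return "", false
}

// waitForGatewayProgrammed keeps a generous timeout for slow but healthy
// load-balancer provisioning, yet bails out as soon as a terminal state is
// seen: returning a non-nil error from the callback stops the poll loop.
func waitForGatewayProgrammed(ctx context.Context, get func(context.Context) (*gatewayapiv1.Gateway, error)) error {
	return wait.PollUntilContextTimeout(ctx, 3*time.Second, 3*time.Minute, false, func(ctx context.Context) (bool, error) {
		gw, err := get(ctx)
		if err != nil {
			return false, nil // transient fetch error: keep polling
		}
		if msg, failed := permanentlyFailed(gw); failed {
			// Exit early with a diagnostic instead of burning the full timeout.
			return false, fmt.Errorf("gateway permanently failed: %s", msg)
		}
		for _, cond := range gw.Status.Conditions {
			if cond.Type == string(gatewayapiv1.GatewayConditionProgrammed) && cond.Status == metav1.ConditionTrue {
				return true, nil // done
			}
		}
		return false, nil // not ready yet: keep polling
	})
}
```

This keeps the long, platform-agnostic budget for slow-but-healthy provisioning while clearly terminal states fail fast with a useful message, which is the second-pass refinement described above.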