|
@@ -98,6 +98,16 @@
 zero = int64(0)
 lablecount = int64(0)
 
+// The GC controller periodically rediscovers available APIs and syncs running informers for those resources.
+// If previously available APIs are removed during that resync process, the sync process can fail and need to be retried.
+//
+// During e2e runs, parallel tests add/remove API resources (by creating/deleting CRDs and aggregated APIs),
+// which makes it likely GC will need to retry informer resync at some point during an e2e run.
+//
+// This timeout covers two resync/retry periods, and should be added to wait timeouts to account for delays
+// to the GC controller caused by API changes in other tests.
+gcInformerResyncRetryTimeout = time.Minute
+
 // CronJobGroupVersionResource unambiguously identifies a CronJob resource.
 CronJobGroupVersionResource = schema.GroupVersionResource{Group: batchv1beta1.GroupName, Version: "v1beta1", Resource: "cronjobs"}
 )
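
The new constant is meant to be folded into every garbage-collection wait in the hunks below. A rough standalone sketch of that pattern, not code from this change: pollWithResyncBuffer and the sample timings are made up for illustration, and only the k8s.io/apimachinery/pkg/util/wait API is real.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// Mirrors the constant added above: a buffer covering two GC informer
// resync/retry periods, added on top of a test's own wait timeout.
const gcInformerResyncRetryTimeout = time.Minute

// pollWithResyncBuffer is a hypothetical helper showing how the buffer is
// combined with a base timeout before handing the condition to wait.Poll.
func pollWithResyncBuffer(interval, base time.Duration, cond wait.ConditionFunc) error {
	return wait.Poll(interval, base+gcInformerResyncRetryTimeout, cond)
}

func main() {
	// Stand-in condition; the real tests use helpers like verifyRemainingObjects.
	err := pollWithResyncBuffer(5*time.Second, 60*time.Second, func() (bool, error) {
		return true, nil
	})
	fmt.Println("poll finished:", err)
}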
|
@@ -351,7 +361,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 }
 ginkgo.By("wait for all pods to be garbage collected")
 // wait for the RCs and Pods to reach the expected numbers.
-if err := wait.Poll(5*time.Second, 60*time.Second, func() (bool, error) {
+if err := wait.Poll(5*time.Second, (60*time.Second)+gcInformerResyncRetryTimeout, func() (bool, error) {
 objects := map[string]int{"ReplicationControllers": 0, "Pods": 0}
 return verifyRemainingObjects(f, objects)
 }); err != nil {
|
@@ -411,7 +421,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 // actual qps is less than 5. Also, the e2e tests are running in
 // parallel, the GC controller might get distracted by other tests.
 // According to the test logs, 120s is enough time.
-if err := wait.Poll(5*time.Second, 120*time.Second, func() (bool, error) {
+if err := wait.Poll(5*time.Second, 120*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
 rcs, err := rcClient.List(context.TODO(), metav1.ListOptions{})
 if err != nil {
 return false, fmt.Errorf("failed to list rcs: %v", err)
|
@@ -518,7 +528,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 framework.Failf("failed to delete the deployment: %v", err)
 }
 ginkgo.By("wait for all rs to be garbage collected")
-err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute, func() (bool, error) {
+err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute+gcInformerResyncRetryTimeout, func() (bool, error) {
 objects := map[string]int{"Deployments": 0, "ReplicaSets": 0, "Pods": 0}
 return verifyRemainingObjects(f, objects)
 })
|
@@ -577,7 +587,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 framework.Failf("failed to delete the deployment: %v", err)
 }
 ginkgo.By("wait for deployment deletion to see if the garbage collector mistakenly deletes the rs")
-err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute, func() (bool, error) {
+err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute+gcInformerResyncRetryTimeout, func() (bool, error) {
 dList, err := deployClient.List(context.TODO(), metav1.ListOptions{})
 if err != nil {
 return false, fmt.Errorf("failed to list deployments: %v", err)
|
@@ -665,7 +675,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 // owner deletion, but in practice there can be a long delay between owner
 // deletion and dependent deletion processing. For now, increase the timeout
 // and investigate the processing delay.
-if err := wait.Poll(1*time.Second, 60*time.Second, func() (bool, error) {
+if err := wait.Poll(1*time.Second, 30*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
 _, err := rcClient.Get(context.TODO(), rc.Name, metav1.GetOptions{})
 if err == nil {
 pods, _ := podClient.List(context.TODO(), metav1.ListOptions{})
|
@@ -864,7 +874,7 @@ var _ = SIGDescribe("Garbage collector", func() {
 var err2 error
 // TODO: shorten the timeout when we make GC's periodic API rediscovery more efficient.
 // Tracked at https://github.com/kubernetes/kubernetes/issues/50046.
-if err := wait.Poll(5*time.Second, 90*time.Second, func() (bool, error) {
+if err := wait.Poll(5*time.Second, 90*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
 pods, err2 = podClient.List(context.TODO(), metav1.ListOptions{})
 if err2 != nil {
 return false, fmt.Errorf("failed to list pods: %v", err)
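
Both wait.Poll and wait.PollImmediate calls receive the same buffer in the hunks above. The only difference between the two helpers is whether the condition runs once before the first interval elapses; a minimal sketch using the real k8s.io/apimachinery/pkg/util/wait API (the timings here are arbitrary examples, not values from this change):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	cond := func() (bool, error) {
		fmt.Println("condition checked at", time.Now().Format(time.StampMilli))
		return true, nil // done on the first check
	}

	// Poll sleeps for one interval (500ms) before the first check.
	_ = wait.Poll(500*time.Millisecond, 5*time.Second, cond)

	// PollImmediate checks right away, then falls back to interval polling.
	_ = wait.PollImmediate(500*time.Millisecond, 5*time.Second, cond)
}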
|
|