
Commit c98fc1f

Merge pull request kubernetes#88053 from liggitt/gc-timeout
Add buffer for GC resync retry to GC e2e tests
2 parents: b4db964 + 242e3eb


test/e2e/apimachinery/garbage_collector.go

Lines changed: 16 additions & 6 deletions
@@ -98,6 +98,16 @@ var (
     zero = int64(0)
     lablecount = int64(0)
 
+    // The GC controller periodically rediscovers available APIs and syncs running informers for those resources.
+    // If previously available APIs are removed during that resync process, the sync process can fail and need to be retried.
+    //
+    // During e2e runs, parallel tests add/remove API resources (by creating/deleting CRDs and aggregated APIs),
+    // which makes it likely GC will need to retry informer resync at some point during an e2e run.
+    //
+    // This timeout covers two resync/retry periods, and should be added to wait timeouts to account for delays
+    // to the GC controller caused by API changes in other tests.
+    gcInformerResyncRetryTimeout = time.Minute
+
     // CronJobGroupVersionResource unambiguously identifies a CronJob resource.
     CronJobGroupVersionResource = schema.GroupVersionResource{Group: batchv1beta1.GroupName, Version: "v1beta1", Resource: "cronjobs"}
 )
@@ -351,7 +361,7 @@ var _ = SIGDescribe("Garbage collector", func() {
         }
         ginkgo.By("wait for all pods to be garbage collected")
         // wait for the RCs and Pods to reach the expected numbers.
-        if err := wait.Poll(5*time.Second, 60*time.Second, func() (bool, error) {
+        if err := wait.Poll(5*time.Second, (60*time.Second)+gcInformerResyncRetryTimeout, func() (bool, error) {
             objects := map[string]int{"ReplicationControllers": 0, "Pods": 0}
             return verifyRemainingObjects(f, objects)
         }); err != nil {
@@ -411,7 +421,7 @@ var _ = SIGDescribe("Garbage collector", func() {
         // actual qps is less than 5. Also, the e2e tests are running in
         // parallel, the GC controller might get distracted by other tests.
         // According to the test logs, 120s is enough time.
-        if err := wait.Poll(5*time.Second, 120*time.Second, func() (bool, error) {
+        if err := wait.Poll(5*time.Second, 120*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
             rcs, err := rcClient.List(context.TODO(), metav1.ListOptions{})
             if err != nil {
                 return false, fmt.Errorf("failed to list rcs: %v", err)
@@ -518,7 +528,7 @@ var _ = SIGDescribe("Garbage collector", func() {
             framework.Failf("failed to delete the deployment: %v", err)
         }
         ginkgo.By("wait for all rs to be garbage collected")
-        err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute, func() (bool, error) {
+        err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute+gcInformerResyncRetryTimeout, func() (bool, error) {
             objects := map[string]int{"Deployments": 0, "ReplicaSets": 0, "Pods": 0}
             return verifyRemainingObjects(f, objects)
         })
@@ -577,7 +587,7 @@ var _ = SIGDescribe("Garbage collector", func() {
             framework.Failf("failed to delete the deployment: %v", err)
         }
         ginkgo.By("wait for deployment deletion to see if the garbage collector mistakenly deletes the rs")
-        err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute, func() (bool, error) {
+        err = wait.PollImmediate(500*time.Millisecond, 1*time.Minute+gcInformerResyncRetryTimeout, func() (bool, error) {
             dList, err := deployClient.List(context.TODO(), metav1.ListOptions{})
             if err != nil {
                 return false, fmt.Errorf("failed to list deployments: %v", err)
@@ -665,7 +675,7 @@ var _ = SIGDescribe("Garbage collector", func() {
         // owner deletion, but in practice there can be a long delay between owner
         // deletion and dependent deletion processing. For now, increase the timeout
        // and investigate the processing delay.
-        if err := wait.Poll(1*time.Second, 60*time.Second, func() (bool, error) {
+        if err := wait.Poll(1*time.Second, 30*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
             _, err := rcClient.Get(context.TODO(), rc.Name, metav1.GetOptions{})
             if err == nil {
                 pods, _ := podClient.List(context.TODO(), metav1.ListOptions{})
@@ -864,7 +874,7 @@ var _ = SIGDescribe("Garbage collector", func() {
         var err2 error
         // TODO: shorten the timeout when we make GC's periodic API rediscovery more efficient.
         // Tracked at https://github.com/kubernetes/kubernetes/issues/50046.
-        if err := wait.Poll(5*time.Second, 90*time.Second, func() (bool, error) {
+        if err := wait.Poll(5*time.Second, 90*time.Second+gcInformerResyncRetryTimeout, func() (bool, error) {
             pods, err2 = podClient.List(context.TODO(), metav1.ListOptions{})
             if err2 != nil {
                 return false, fmt.Errorf("failed to list pods: %v", err)
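
For context (not part of the commit itself), the change applies one pattern throughout: pad each poll's timeout with the new gcInformerResyncRetryTimeout buffer so the assertion tolerates a GC informer resync happening mid-test. The following is a minimal, hypothetical sketch of that padding using k8s.io/apimachinery/pkg/util/wait; waitForGC, baseTimeout, and check are illustrative names standing in for the e2e helpers (such as verifyRemainingObjects) used in the actual tests.

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

// Assumed constant mirroring the one added in the diff: a one-minute buffer
// covering two GC informer resync/retry periods.
const gcInformerResyncRetryTimeout = time.Minute

// waitForGC is a hypothetical helper illustrating the pattern from the diff:
// poll a condition at a fixed interval, but extend the caller's base timeout
// by the resync-retry buffer so the check does not flake when the GC
// controller has to re-sync its informers during the test run.
func waitForGC(baseTimeout time.Duration, check wait.ConditionFunc) error {
    return wait.Poll(5*time.Second, baseTimeout+gcInformerResyncRetryTimeout, check)
}

func main() {
    // Example usage with a trivial condition that succeeds immediately.
    err := waitForGC(60*time.Second, func() (bool, error) { return true, nil })
    fmt.Println("wait result:", err)
}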
