Skip to content

Commit 51ad0bb

Browse files
authored
Merge pull request kubernetes#124795 from atiratree/fix-daemon-max-surge-flake
e2e: DaemonSet maxSurge test should account for pods that are terminated by the test itself
2 parents c7c4039 + d70c3f7 commit 51ad0bb

File tree

1 file changed

+26
-11
lines changed

1 file changed

+26
-11
lines changed

test/e2e/apps/daemon_set.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -585,10 +585,12 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
585585
nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
586586
framework.ExpectNoError(err)
587587
nodeCount := len(nodes.Items)
588-
retryTimeout := dsRetryTimeout + time.Duration(nodeCount*30)*time.Second
588+
// We disturb daemonset progress by randomly terminating pods.
589+
randomPodTerminationTimeout := 5 * time.Minute
590+
retryTimeout := dsRetryTimeout + randomPodTerminationTimeout + time.Duration(nodeCount*30)*time.Second
589591

590592
ginkgo.By("Check that daemon pods surge and invariants are preserved during that rollout")
591-
ageOfOldPod := make(map[string]time.Time)
593+
nodeToAgeOfOldPod := make(map[string]map[string]time.Time)
592594
deliberatelyDeletedPods := sets.NewString()
593595
err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, retryTimeout, true, func(ctx context.Context) (bool, error) {
594596
podList, err := c.CoreV1().Pods(ds.Namespace).List(ctx, metav1.ListOptions{})
@@ -682,17 +684,25 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
682684
// if this is a pod in an older version AND there is a new version of this pod, record when
683685
// we started seeing this, otherwise delete the record (perhaps the node was drained)
684686
if nodesToVersions[pod.Spec.NodeName][newVersion] > 0 {
685-
if _, ok := ageOfOldPod[string(pod.UID)]; !ok {
686-
ageOfOldPod[string(pod.UID)] = now
687+
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)]; !ok {
688+
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName]; !ok {
689+
nodeToAgeOfOldPod[pod.Spec.NodeName] = make(map[string]time.Time)
690+
}
691+
nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)] = now
687692
}
688693
} else {
689-
delete(ageOfOldPod, string(pod.UID))
694+
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
690695
}
691696
}
692697
// purge the old pods list of any deleted pods
693-
for uid := range ageOfOldPod {
694-
if !podUIDs.Has(uid) {
695-
delete(ageOfOldPod, uid)
698+
for node, uidToTime := range nodeToAgeOfOldPod {
699+
for uid := range uidToTime {
700+
if !podUIDs.Has(uid) {
701+
delete(uidToTime, uid)
702+
}
703+
}
704+
if len(uidToTime) == 0 {
705+
delete(nodeToAgeOfOldPod, node)
696706
}
697707
}
698708
deliberatelyDeletedPods = deliberatelyDeletedPods.Intersection(deletedPodUIDs)
@@ -713,9 +723,11 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
713723
}
714724

715725
// invariant: the controller must react to the new pod becoming ready within a reasonable timeframe (2x grace period)
716-
for uid, firstSeen := range ageOfOldPod {
717-
if now.Sub(firstSeen) > maxSurgeOverlap {
718-
errs = append(errs, fmt.Sprintf("An old pod with UID %s has been running alongside a newer version for longer than %s", uid, maxSurgeOverlap))
726+
for node, uidToTime := range nodeToAgeOfOldPod {
727+
for uid, firstSeenSinceNewVersionPod := range uidToTime {
728+
if now.Sub(firstSeenSinceNewVersionPod) > maxSurgeOverlap {
729+
errs = append(errs, fmt.Sprintf("An old pod with UID %s on a node %s has been running alongside a newer version for longer than %s", uid, node, maxSurgeOverlap))
730+
}
719731
}
720732
}
721733

@@ -800,6 +812,9 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
800812
} else {
801813
framework.Logf("Deleted pod %s prematurely", pod.Name)
802814
deliberatelyDeletedPods.Insert(string(pod.UID))
815+
// If it is an old-version pod, we do not need to measure the controller's reaction, because we terminated it ourselves.
816+
// If it is a new version, we have to reset the time to start counting the time for the replacement pod to reach readiness again.
817+
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
803818
}
804819
}
805820
}

0 commit comments

Comments (0)