@@ -585,10 +585,12 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 		framework.ExpectNoError(err)
 		nodeCount := len(nodes.Items)
-		retryTimeout := dsRetryTimeout + time.Duration(nodeCount*30)*time.Second
+		// We disturb daemonset progress by randomly terminating pods.
+		randomPodTerminationTimeout := 5 * time.Minute
+		retryTimeout := dsRetryTimeout + randomPodTerminationTimeout + time.Duration(nodeCount*30)*time.Second
 
 		ginkgo.By("Check that daemon pods surge and invariants are preserved during that rollout")
-		ageOfOldPod := make(map[string]time.Time)
+		nodeToAgeOfOldPod := make(map[string]map[string]time.Time)
 		deliberatelyDeletedPods := sets.NewString()
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, retryTimeout, true, func(ctx context.Context) (bool, error) {
 			podList, err := c.CoreV1().Pods(ds.Namespace).List(ctx, metav1.ListOptions{})
@@ -682,17 +684,25 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 				// if this is a pod in an older version AND there is a new version of this pod, record when
 				// we started seeing this, otherwise delete the record (perhaps the node was drained)
 				if nodesToVersions[pod.Spec.NodeName][newVersion] > 0 {
-					if _, ok := ageOfOldPod[string(pod.UID)]; !ok {
-						ageOfOldPod[string(pod.UID)] = now
+					if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)]; !ok {
+						if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName]; !ok {
+							nodeToAgeOfOldPod[pod.Spec.NodeName] = make(map[string]time.Time)
+						}
+						nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)] = now
 					}
 				} else {
-					delete(ageOfOldPod, string(pod.UID))
+					delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
 				}
 			}
 			// purge the old pods list of any deleted pods
-			for uid := range ageOfOldPod {
-				if !podUIDs.Has(uid) {
-					delete(ageOfOldPod, uid)
+			for node, uidToTime := range nodeToAgeOfOldPod {
+				for uid := range uidToTime {
+					if !podUIDs.Has(uid) {
+						delete(uidToTime, uid)
+					}
+				}
+				if len(uidToTime) == 0 {
+					delete(nodeToAgeOfOldPod, node)
 				}
 			}
 			deliberatelyDeletedPods = deliberatelyDeletedPods.Intersection(deletedPodUIDs)
@@ -713,9 +723,11 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 			}
 
 			// invariant: the controller must react to the new pod becoming ready within a reasonable timeframe (2x grace period)
-			for uid, firstSeen := range ageOfOldPod {
-				if now.Sub(firstSeen) > maxSurgeOverlap {
-					errs = append(errs, fmt.Sprintf("An old pod with UID %s has been running alongside a newer version for longer than %s", uid, maxSurgeOverlap))
+			for node, uidToTime := range nodeToAgeOfOldPod {
+				for uid, firstSeenSinceNewVersionPod := range uidToTime {
+					if now.Sub(firstSeenSinceNewVersionPod) > maxSurgeOverlap {
+						errs = append(errs, fmt.Sprintf("An old pod with UID %s on node %s has been running alongside a newer version for longer than %s", uid, node, maxSurgeOverlap))
+					}
 				}
 			}
 
@@ -800,6 +812,9 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 				} else {
 					framework.Logf("Deleted pod %s prematurely", pod.Name)
 					deliberatelyDeletedPods.Insert(string(pod.UID))
+					// If it is an old pod, we do not need to measure the controller's reaction because we deleted it ourselves.
+					// If it is a new pod, we have to reset the timer so the overlap is counted again once the replacement pod reaches readiness.
+					delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
 				}
 			}
 		}
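For readers following the change, here is a minimal, self-contained sketch of the per-node bookkeeping pattern the diff introduces, written against simplified stand-in types rather than the real test helpers (simplifiedPod, livePodUIDs, the maxSurgeOverlap value, and the nodesToVersions construction are illustrative assumptions, not the test's code): record when an old pod is first seen coexisting with a new version on its node, purge entries for pods that no longer exist, and flag any overlap that exceeds the allowed window.

```go
package main

import (
	"fmt"
	"time"
)

// simplifiedPod is a stand-in for v1.Pod with only the fields the bookkeeping needs.
type simplifiedPod struct {
	UID      string
	NodeName string
	Version  string // stand-in for the pod's template hash / generation label
}

// Assumed overlap window for this sketch; the real test derives its own value.
const maxSurgeOverlap = 60 * time.Second

func main() {
	// node -> pod UID -> time we first saw this old pod coexist with a new version on that node
	nodeToAgeOfOldPod := map[string]map[string]time.Time{}

	newVersion := "2"
	now := time.Now()

	pods := []simplifiedPod{
		{UID: "a", NodeName: "node-1", Version: "1"}, // old pod
		{UID: "b", NodeName: "node-1", Version: "2"}, // new pod surged onto the same node
	}

	// Count versions per node so we know whether a new-version pod exists next to an old one.
	nodesToVersions := map[string]map[string]int{}
	livePodUIDs := map[string]bool{}
	for _, pod := range pods {
		if nodesToVersions[pod.NodeName] == nil {
			nodesToVersions[pod.NodeName] = map[string]int{}
		}
		nodesToVersions[pod.NodeName][pod.Version]++
		livePodUIDs[pod.UID] = true
	}

	for _, pod := range pods {
		if pod.Version == newVersion {
			continue // only old pods are tracked
		}
		if nodesToVersions[pod.NodeName][newVersion] > 0 {
			// An old pod coexists with a new version: record when we first noticed it.
			if _, ok := nodeToAgeOfOldPod[pod.NodeName]; !ok {
				nodeToAgeOfOldPod[pod.NodeName] = map[string]time.Time{}
			}
			if _, ok := nodeToAgeOfOldPod[pod.NodeName][pod.UID]; !ok {
				nodeToAgeOfOldPod[pod.NodeName][pod.UID] = now
			}
		} else {
			// No new version on this node (perhaps it was drained): forget the node.
			delete(nodeToAgeOfOldPod, pod.NodeName)
		}
	}

	// Purge entries for pods that no longer exist, and drop nodes with nothing left to track.
	for node, uidToTime := range nodeToAgeOfOldPod {
		for uid := range uidToTime {
			if !livePodUIDs[uid] {
				delete(uidToTime, uid)
			}
		}
		if len(uidToTime) == 0 {
			delete(nodeToAgeOfOldPod, node)
		}
	}

	// Invariant check: no old pod may overlap a newer version for longer than maxSurgeOverlap.
	for node, uidToTime := range nodeToAgeOfOldPod {
		for uid, firstSeen := range uidToTime {
			if now.Sub(firstSeen) > maxSurgeOverlap {
				fmt.Printf("old pod %s on node %s overlapped a newer version for more than %s\n", uid, node, maxSurgeOverlap)
			}
		}
	}
	fmt.Printf("nodes with old pods still overlapping a new version: %d\n", len(nodeToAgeOfOldPod))
}
```

The two-level map mirrors the change above: deleting a node's entry resets the overlap clock for that node, which is what the deliberate-deletion branch in the last hunk relies on.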