
Commit 9038648

Fix DaemonSet pods circling in Failed/Pending during disruption
1 parent 72387e3 commit 9038648

File tree: 3 files changed (+105, -6 lines)


pkg/controllers/disruption/suite_test.go

Lines changed: 50 additions & 5 deletions
@@ -1386,7 +1386,7 @@ var _ = Describe("Candidate Filtering", func() {
 		Expect(err.Error()).To(Equal(fmt.Sprintf(`pdb prevents pod evictions (PodDisruptionBudget=[%s])`, client.ObjectKeyFromObject(budget))))
 		Expect(recorder.DetectedEvent(fmt.Sprintf(`Pdb prevents pod evictions (PodDisruptionBudget=[%s])`, client.ObjectKeyFromObject(budget)))).To(BeTrue())
 	})
-	It("should not consider candidates that have fully blocking PDBs on daemonset pods", func() {
+	It("should consider candidates that have fully blocking PDBs on daemonset pods since daemonset pods are not evicted", func() {
 		daemonSet := test.DaemonSet()
 		nodeClaim, node := test.NodeClaimAndNode(v1.NodeClaim{
 			ObjectMeta: metav1.ObjectMeta{
@@ -1428,10 +1428,55 @@ var _ = Describe("Candidate Filtering", func() {
 		Expect(err).ToNot(HaveOccurred())
 
 		Expect(cluster.DeepCopyNodes()).To(HaveLen(1))
-		_, err = disruption.NewCandidate(ctx, env.Client, recorder, fakeClock, cluster.DeepCopyNodes()[0], pdbLimits, nodePoolMap, nodePoolInstanceTypeMap, queue, disruption.GracefulDisruptionClass)
-		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(Equal(fmt.Sprintf(`pdb prevents pod evictions (PodDisruptionBudget=[%s])`, client.ObjectKeyFromObject(budget))))
-		Expect(recorder.DetectedEvent(fmt.Sprintf(`Pdb prevents pod evictions (PodDisruptionBudget=[%s])`, client.ObjectKeyFromObject(budget)))).To(BeTrue())
+		// DaemonSet pods are not evicted during disruption, so PDB does not block the node from being a candidate
+		c, err := disruption.NewCandidate(ctx, env.Client, recorder, fakeClock, cluster.DeepCopyNodes()[0], pdbLimits, nodePoolMap, nodePoolInstanceTypeMap, queue, disruption.GracefulDisruptionClass)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c).ToNot(BeNil())
+		Expect(c.NodeClaim.Name).To(Equal(nodeClaim.Name))
+	})
+	It("should consider candidates with only daemonset pods as these pods should not be evicted during disruption", func() {
+		daemonSet := test.DaemonSet()
+		nodeClaim, node := test.NodeClaimAndNode(v1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{
+					v1.NodePoolLabelKey:            nodePool.Name,
+					corev1.LabelInstanceTypeStable: mostExpensiveInstance.Name,
+					v1.CapacityTypeLabelKey:        mostExpensiveOffering.Requirements.Get(v1.CapacityTypeLabelKey).Any(),
+					corev1.LabelTopologyZone:       mostExpensiveOffering.Requirements.Get(corev1.LabelTopologyZone).Any(),
+				},
+			},
+		})
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node, daemonSet)
+		pod := test.Pod(test.PodOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				OwnerReferences: []metav1.OwnerReference{
+					{
+						APIVersion: "apps/v1",
+						Kind:       "DaemonSet",
+						Name:       daemonSet.Name,
+						UID:        daemonSet.UID,
+						Controller: lo.ToPtr(true),
+					},
+				},
+			},
+		})
+		ExpectApplied(ctx, env.Client, pod)
+		ExpectManualBinding(ctx, env.Client, pod, node)
+
+		ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim})
+
+		var err error
+		pdbLimits, err = pdb.NewLimits(ctx, env.Client)
+		Expect(err).ToNot(HaveOccurred())
+
+		Expect(cluster.DeepCopyNodes()).To(HaveLen(1))
+		c, err := disruption.NewCandidate(ctx, env.Client, recorder, fakeClock, cluster.DeepCopyNodes()[0], pdbLimits, nodePoolMap, nodePoolInstanceTypeMap, queue, disruption.GracefulDisruptionClass)
+		// A candidate with only DaemonSet pods should be created successfully
+		// DaemonSet pods are not considered reschedulable, so the node should be treated similarly to an empty node
+		Expect(err).ToNot(HaveOccurred())
+		Expect(c).ToNot(BeNil())
+		Expect(c.NodeClaim.Name).To(Equal(nodeClaim.Name))
+		Expect(c.Node.Name).To(Equal(node.Name))
 	})
 	It("should consider candidates that have fully blocking PDBs on mirror pods", func() {
 		nodeClaim, node := test.NodeClaimAndNode(v1.NodeClaim{

pkg/controllers/node/termination/suite_test.go

Lines changed: 48 additions & 0 deletions
@@ -469,6 +469,54 @@ var _ = Describe("Termination", func() {
 		ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
 		ExpectNotFound(ctx, env.Client, node)
 	})
+	It("should not evict daemonset pods during node disruption", func() {
+		daemonSet := test.DaemonSet()
+		ExpectApplied(ctx, env.Client, daemonSet)
+
+		regularPod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
+		daemonSetPod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: []metav1.OwnerReference{{
+			APIVersion:         "apps/v1",
+			Kind:               "DaemonSet",
+			Name:               daemonSet.Name,
+			UID:                daemonSet.UID,
+			Controller:         lo.ToPtr(true),
+			BlockOwnerDeletion: lo.ToPtr(true),
+		}}}})
+
+		ExpectApplied(ctx, env.Client, node, nodeClaim, regularPod, daemonSetPod)
+
+		// Add disrupted taint to node to simulate disruption
+		node.Spec.Taints = append(node.Spec.Taints, v1.DisruptedNoScheduleTaint)
+		ExpectApplied(ctx, env.Client, node)
+
+		// Trigger Termination Controller
+		Expect(env.Client.Delete(ctx, node)).To(Succeed())
+		node = ExpectNodeExists(ctx, env.Client, node.Name)
+
+		ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainInitiation
+		// Only the regular pod should be added to the eviction queue
+		ExpectObjectReconciled(ctx, env.Client, queue, regularPod)
+
+		// DaemonSet pod should NOT be in the eviction queue
+		Expect(queue.Has(daemonSetPod)).To(BeFalse())
+
+		// Expect node to exist and be draining
+		ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
+
+		// Only regular pod should be terminating
+		EventuallyExpectTerminating(ctx, env.Client, regularPod)
+		ExpectDeleted(ctx, env.Client, regularPod)
+
+		// DaemonSet pod should still be running (not terminating)
+		pod := ExpectPodExists(ctx, env.Client, daemonSetPod.Name, daemonSetPod.Namespace)
+		Expect(pod.DeletionTimestamp).To(BeNil())
+
+		// Reconcile to delete node - node should be deleted even with DaemonSet pod still running
+		node = ExpectNodeExists(ctx, env.Client, node.Name)
+		ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation, VolumeDetachment, InstanceTerminationInitiation
+		ExpectNotRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // InstanceTerminationValidation
+		ExpectNotFound(ctx, env.Client, node)
+	})
 	It("should evict non-critical pods first", func() {
 		podEvict := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
 		podNodeCritical := test.Pod(test.PodOptions{NodeName: node.Name, PriorityClassName: "system-node-critical", ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})

pkg/utils/pod/scheduling.go

Lines changed: 7 additions & 1 deletion
@@ -52,11 +52,13 @@ func IsReschedulable(pod *corev1.Pod) bool {
 // - Is an active pod (isn't terminal or actively terminating)
 // - Doesn't tolerate the "karpenter.sh/disruption=disrupting" taint
 // - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
+// - Isn't owned by a DaemonSet (DaemonSet pods will be handled by DaemonSet controller)
 // - Does not have the "karpenter.sh/do-not-disrupt=true" annotation (https://karpenter.sh/docs/concepts/disruption/#pod-level-controls)
 func IsEvictable(pod *corev1.Pod) bool {
 	return IsActive(pod) &&
 		!ToleratesDisruptedNoScheduleTaint(pod) &&
 		!IsOwnedByNode(pod) &&
+		!IsOwnedByDaemonSet(pod) &&
 		!HasDoNotDisrupt(pod)
 }
 
@@ -72,14 +74,18 @@ func IsWaitingEviction(pod *corev1.Pod, clk clock.Clock) bool {
 // - Doesn't tolerate the "karpenter.sh/disruption=disrupting" taint
 // - Isn't a pod that has been terminating past its terminationGracePeriodSeconds
 // - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
+// - Isn't owned by a DaemonSet (DaemonSet pods will be handled by DaemonSet controller)
 // Note: pods with the `karpenter.sh/do-not-disrupt` annotation are included since node drain should stall until these pods are evicted or become terminal, even though Karpenter won't orchestrate the eviction.
 func IsDrainable(pod *corev1.Pod, clk clock.Clock) bool {
 	return !ToleratesDisruptedNoScheduleTaint(pod) &&
 		!IsStuckTerminating(pod, clk) &&
 		// Mirror pods cannot be deleted through the API server since they are created and managed by kubelet
 		// This means they are effectively read-only and can't be controlled by API server calls
 		// https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain
-		!IsOwnedByNode(pod)
+		!IsOwnedByNode(pod) &&
+		// DaemonSet pods should not be drained during disruption as they will be automatically
+		// managed by the DaemonSet controller and recreated on other nodes when needed
+		!IsOwnedByDaemonSet(pod)
 }
 
 // IsPodEligibleForForcedEviction checks if a pod needs to be deleted with a reduced grace period ensuring that the pod:
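
The diff calls IsOwnedByDaemonSet from IsEvictable and IsDrainable but does not show the helper itself. As a minimal sketch of what such a check could look like, assuming it keys off the pod's controlling owner reference via metav1.GetControllerOf (only the helper name comes from the diff; the body below is an assumption, not necessarily the commit's actual implementation):

package pod

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// IsOwnedByDaemonSet reports whether the pod's controlling owner reference
// points at an apps/v1 DaemonSet. Sketch only; the real helper in
// pkg/utils/pod may be implemented differently.
func IsOwnedByDaemonSet(pod *corev1.Pod) bool {
	owner := metav1.GetControllerOf(pod) // nil when the pod has no controller reference
	return owner != nil && owner.APIVersion == "apps/v1" && owner.Kind == "DaemonSet"
}

Checking the controller reference lines up with how the new tests construct their DaemonSet pods above (OwnerReferences with APIVersion "apps/v1", Kind "DaemonSet", and Controller: lo.ToPtr(true)).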
