Skip to content

Commit 40c250a

Browse files
committed
Fix OpenStackClient pod relocation during node failures
These changes ensure OpenStackClient pods are automatically rescheduled when nodes fail, instead of requiring manual intervention to delete stuck pods. The 120-second tolerations provide faster failover compared to the 5-minute default, while the stuck pod detection handles edge cases where normal eviction fails. - Adds tolerations for faster pod eviction (120s vs. the 5-minute default) * Handle node.kubernetes.io/not-ready taints * Handle node.kubernetes.io/unreachable taints - Force delete stuck pods with grace period 0 Note: - Going lower than 120s could be too aggressive and result in pod eviction, e.g. during a network issue or a kubelet restart. - In a follow-up, the same tolerations should be added to the operator controller manager deployments, since the openstack-operator-controller-manager is the one handling the openstackclient pod. Jira: OSPRH-18450 Signed-off-by: Martin Schuppert <[email protected]>
1 parent c4bce29 commit 40c250a

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

controllers/client/openstackclient_controller.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,18 @@ func (r *OpenStackClientReconciler) Reconcile(ctx context.Context, req ctrl.Requ
378378
)
379379
}
380380

381+
// if pod is stuck in terminating state for more than 3 minutes, force delete it
382+
if osclient.DeletionTimestamp != nil {
383+
terminatingDuration := time.Since(osclient.DeletionTimestamp.Time)
384+
if terminatingDuration > time.Minute*3 {
385+
// Force delete only truly stuck pods
386+
err := r.Client.Delete(ctx, osclient, client.GracePeriodSeconds(0))
387+
if err != nil {
388+
return ctrl.Result{}, fmt.Errorf("Failed to force delete pod: %w", err)
389+
}
390+
}
391+
}
392+
381393
podReady := false
382394

383395
for _, condition := range osclient.Status.Conditions {

pkg/openstackclient/funcs.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@ func ClientPodSpec(
9595
VolumeMounts: volumeMounts,
9696
},
9797
},
98+
Tolerations: []corev1.Toleration{
99+
{
100+
Key: "node.kubernetes.io/not-ready",
101+
Operator: corev1.TolerationOpExists,
102+
Effect: corev1.TaintEffectNoExecute,
103+
TolerationSeconds: &[]int64{120}[0],
104+
},
105+
{
106+
Key: "node.kubernetes.io/unreachable",
107+
Operator: corev1.TolerationOpExists,
108+
Effect: corev1.TaintEffectNoExecute,
109+
TolerationSeconds: &[]int64{120}[0],
110+
},
111+
},
98112
}
99113

100114
if instance.Spec.NodeSelector != nil {

0 commit comments

Comments
 (0)