Fix OpenStackClient pod relocation during node failures

stuggi · openshift-cherrypick-robot · commit 1b6be7ba0ea5 · 2025-07-30T14:52:41.000Z
These changes ensure OpenStackClient pods are automatically rescheduled
when nodes fail, instead of requiring manual intervention to delete
stuck pods. The 120-second tolerations provide faster failover compared
to the 5min default, while the stuck pod detection handles edge cases
where normal eviction fails.

- Adds tolerations for faster pod eviction (120s vs 5min default)
  * Handle node.kubernetes.io/not-ready taints
  * Handle node.kubernetes.io/unreachable taints
- Force delete pods stuck in terminating state for more than 3 minutes
  (grace period 0)

Note:
- going lower than 120s could be too aggressive and result in pod
eviction, e.g. during a network issue or a kubelet restart
- in a follow-up, the same tolerations should be added to the operator
controller manager deployments, since the
openstack-operator-controller-manager is the one handling the
openstackclient pod.

Jira: OSPRH-18450

Signed-off-by: Martin Schuppert <mschuppert@redhat.com>
diff --git a/controllers/client/openstackclient_controller.go b/controllers/client/openstackclient_controller.go
@@ -378,6 +378,18 @@ func (r *OpenStackClientReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 		)
 	}
 
+	// if pod is stuck in terminating state for more than 3 minutes, force delete it
+	if osclient.DeletionTimestamp != nil {
+		terminatingDuration := time.Since(osclient.DeletionTimestamp.Time)
+		if terminatingDuration > time.Minute*3 {
+			// Force delete only truly stuck pods
+			err := r.Client.Delete(ctx, osclient, client.GracePeriodSeconds(0))
+			if err != nil {
+				return ctrl.Result{}, fmt.Errorf("Failed to force delete pod: %w", err)
+			}
+		}
+	}
+
 	podReady := false
 
 	for _, condition := range osclient.Status.Conditions {
diff --git a/pkg/openstackclient/funcs.go b/pkg/openstackclient/funcs.go
@@ -95,6 +95,20 @@ func ClientPodSpec(
 				VolumeMounts: volumeMounts,
 			},
 		},
+		Tolerations: []corev1.Toleration{
+			{
+				Key:               "node.kubernetes.io/not-ready",
+				Operator:          corev1.TolerationOpExists,
+				Effect:            corev1.TaintEffectNoExecute,
+				TolerationSeconds: &[]int64{120}[0],
+			},
+			{
+				Key:               "node.kubernetes.io/unreachable",
+				Operator:          corev1.TolerationOpExists,
+				Effect:            corev1.TaintEffectNoExecute,
+				TolerationSeconds: &[]int64{120}[0],
+			},
+		},
 	}
 
 	if instance.Spec.NodeSelector != nil {