From c22689cca959ba5dbac526090ce7859935a72f61 Mon Sep 17 00:00:00 2001 From: Patryk Diak Date: Mon, 13 Oct 2025 12:20:57 +0200 Subject: [PATCH 1/2] Handle failed CloudPrivateIPConfig assignments When a CPIC is created and the cloud controller externally sets its status to Failed, ovn-kubernetes would not clean up or retry the assignment. Reproducer: 1. CPIC assignment fails 2. The EgressIP status never reflects it as it ignores non-sucessfull assignments 3. Later reconciliation attempts trying to create the same CPIC again 4. Creation failures due to "already exists" errors To fix the issue remove failed CPICs druing CPIC add reconciliation. Signed-off-by: Patryk Diak --- .../pkg/clustermanager/egressip_controller.go | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go index 70ec7a58e2..5199bb66d2 100644 --- a/go-controller/pkg/clustermanager/egressip_controller.go +++ b/go-controller/pkg/clustermanager/egressip_controller.go @@ -36,6 +36,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" + "k8s.io/apimachinery/pkg/api/meta" ) const ( @@ -211,6 +212,26 @@ func (eIPC *egressIPClusterController) executeCloudPrivateIPConfigOps(egressIPNa if cloudPrivateIPConfig.GetDeletionTimestamp() != nil && !cloudPrivateIPConfig.GetDeletionTimestamp().IsZero() { return fmt.Errorf("cloud update request failed, CloudPrivateIPConfig: %s is being deleted", cloudPrivateIPConfigName) } + + // Handle a scenario in which the object exists in a failed state by removing it + assignedCondition := meta.FindStatusCondition(cloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned)) + if assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse { + klog.Warningf("CloudPrivateIPConfig: %s is in Failed state (reason: %s), deleting to allow retry", cloudPrivateIPConfigName, assignedCondition.Message) + eIPRef := corev1.ObjectReference{ + Kind: "EgressIP", + Name: egressIPName, + } + eIPC.recorder.Eventf(&eIPRef, corev1.EventTypeWarning, "CloudAssignmentRetry", + "egress IP: %s previously failed on node %s (reason: %s), will retry assignment", + egressIP, cloudPrivateIPConfig.Spec.Node, assignedCondition.Message) + if err := eIPC.kube.DeleteCloudPrivateIPConfig(cloudPrivateIPConfigName); err != nil { + return fmt.Errorf("failed to delete failed CloudPrivateIPConfig: %s, err: %v", cloudPrivateIPConfigName, err) + } + + // Return an error to trigger retry + return fmt.Errorf("deleted failed CloudPrivateIPConfig: %s, will retry creation in next reconciliation", cloudPrivateIPConfigName) + } + if op.toAdd == cloudPrivateIPConfig.Spec.Node { klog.Infof("CloudPrivateIPConfig: %s already assigned to node: %s", cloudPrivateIPConfigName, cloudPrivateIPConfig.Spec.Node) continue From e628ea72d9e70b8e93aead8624ae938d60af12bd Mon Sep 17 00:00:00 2001 From: Patryk Diak Date: Mon, 20 Oct 2025 13:19:43 +0200 Subject: [PATCH 2/2] wip Signed-off-by: Patryk Diak --- .../pkg/clustermanager/egressip_controller.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go index 5199bb66d2..8b3fec43ba 100644 --- a/go-controller/pkg/clustermanager/egressip_controller.go +++ b/go-controller/pkg/clustermanager/egressip_controller.go @@ -1519,18 +1519,20 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o } if new != nil { newCloudPrivateIPConfig = new + assignedCondition := meta.FindStatusCondition(newCloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned)) // We should only proceed to setting things up for objects where the new // object has the same .spec.node and .status.node, and assignment // condition being true. This is how the cloud-network-config-controller // indicates a successful cloud assignment. - shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node && - ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned && - corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue + shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node && assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue // See above explanation for the delete shouldDelete = shouldDelete && (newCloudPrivateIPConfig.Status.Node == "" || newCloudPrivateIPConfig.Status.Node != oldCloudPrivateIPConfig.Status.Node) && - ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned && - corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue + assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue + + // Handle CloudPrivateIPConfigs that transitioned into a failed state + shouldDelete = shouldDelete || assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse + // On UPDATE we need to delete the old .status.node if shouldDelete { nodeToDelete = oldCloudPrivateIPConfig.Status.Node @@ -1591,7 +1593,7 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o } for _, resyncEgressIP := range resyncEgressIPs { if err := eIPC.reconcileEgressIP(nil, resyncEgressIP); err != nil { - return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", egressIP.Name, err) + return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", resyncEgressIP.Name, err) } } }