diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go index 70ec7a58e2..8b3fec43ba 100644 --- a/go-controller/pkg/clustermanager/egressip_controller.go +++ b/go-controller/pkg/clustermanager/egressip_controller.go @@ -36,6 +36,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" + "k8s.io/apimachinery/pkg/api/meta" ) const ( @@ -211,6 +212,26 @@ func (eIPC *egressIPClusterController) executeCloudPrivateIPConfigOps(egressIPNa if cloudPrivateIPConfig.GetDeletionTimestamp() != nil && !cloudPrivateIPConfig.GetDeletionTimestamp().IsZero() { return fmt.Errorf("cloud update request failed, CloudPrivateIPConfig: %s is being deleted", cloudPrivateIPConfigName) } + + // Handle a scenario in which the object exists in a failed state by removing it + assignedCondition := meta.FindStatusCondition(cloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned)) + if assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse { + klog.Warningf("CloudPrivateIPConfig: %s is in Failed state (reason: %s), deleting to allow retry", cloudPrivateIPConfigName, assignedCondition.Message) + eIPRef := corev1.ObjectReference{ + Kind: "EgressIP", + Name: egressIPName, + } + eIPC.recorder.Eventf(&eIPRef, corev1.EventTypeWarning, "CloudAssignmentRetry", + "egress IP: %s previously failed on node %s (reason: %s), will retry assignment", + egressIP, cloudPrivateIPConfig.Spec.Node, assignedCondition.Message) + if err := eIPC.kube.DeleteCloudPrivateIPConfig(cloudPrivateIPConfigName); err != nil { + return fmt.Errorf("failed to delete failed CloudPrivateIPConfig: %s, err: %v", cloudPrivateIPConfigName, err) + } + + // Return an error to trigger retry + return fmt.Errorf("deleted failed CloudPrivateIPConfig: %s, will retry creation in next reconciliation", cloudPrivateIPConfigName) + } + if op.toAdd == cloudPrivateIPConfig.Spec.Node { klog.Infof("CloudPrivateIPConfig: %s already assigned to node: %s", cloudPrivateIPConfigName, cloudPrivateIPConfig.Spec.Node) continue @@ -1498,18 +1519,20 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o } if new != nil { newCloudPrivateIPConfig = new + assignedCondition := meta.FindStatusCondition(newCloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned)) // We should only proceed to setting things up for objects where the new // object has the same .spec.node and .status.node, and assignment // condition being true. This is how the cloud-network-config-controller // indicates a successful cloud assignment. - shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node && - ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned && - corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue + shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node && assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue // See above explanation for the delete shouldDelete = shouldDelete && (newCloudPrivateIPConfig.Status.Node == "" || newCloudPrivateIPConfig.Status.Node != oldCloudPrivateIPConfig.Status.Node) && - ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned && - corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue + assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue + + // Handle CloudPrivateIPConfigs that transitioned into a failed state + shouldDelete = shouldDelete || assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse + // On UPDATE we need to delete the old .status.node if shouldDelete { nodeToDelete = oldCloudPrivateIPConfig.Status.Node @@ -1570,7 +1593,7 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o } for _, resyncEgressIP := range resyncEgressIPs { if err := eIPC.reconcileEgressIP(nil, resyncEgressIP); err != nil { - return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", egressIP.Name, err) + return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", resyncEgressIP.Name, err) } } }