Skip to content

Commit 68db55e

Browse files
EIP OVN startup syncer: fix processing of ovn constructs
The startup syncer was removing OVN constructs due to logic bugs introduced when EIP code was refactored for UDN. The are added again when eip controller syncs but this causes interruption. 1. Due to poor naming, enforcement of types and programmer error we were mixing up variables between a pod IP and an EIP IP. See: nodeName, ok := cache.egressIPIPToNodeCache[parsedLogicalIP.String()] parsedLogicalIP is a pod IP and not an EIP IP. 2. When iterating over the existing config for an EIP, we should delete config for LRPs where an EIP doesn't exist. 3. Remove LRPs when a network isnt found Signed-off-by: Martin Kennelly <[email protected]>
1 parent 7588fd3 commit 68db55e

File tree

1 file changed

+49
-29
lines changed

1 file changed

+49
-29
lines changed

go-controller/pkg/ovn/egressip.go

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,8 @@ func (e *EgressIPController) isLocalZoneNode(node *corev1.Node) bool {
11381138
type egressIPCache struct {
11391139
// egressIP name -> network name -> cache
11401140
egressIPNameToPods map[string]map[string]selectedPods
1141+
// egressIP name -> to assigned Node names
1142+
egressIPNameToAssignedNodes map[string][]string
11411143
// egressLocalNodes will contain all nodes that are local
11421144
// to this zone which are serving this egressIP object..
11431145
// This will help sync SNATs
@@ -1154,7 +1156,7 @@ type egressIPCache struct {
11541156
}
11551157

11561158
type nodeNetworkRedirects struct {
1157-
// node name -> network name -> redirect IPs
1159+
// network name -> node name -> redirect IPs
11581160
cache map[string]map[string]redirectIPs
11591161
}
11601162

@@ -1600,21 +1602,36 @@ func (e *EgressIPController) syncPodAssignmentCache(egressIPCache egressIPCache)
16001602
// It also removes stale nexthops from router policies used by EgressIPs.
16011603
// Upon failure, it may be invoked multiple times in order to avoid a pod restart.
16021604
func (e *EgressIPController) syncStaleEgressReroutePolicy(cache egressIPCache) error {
1603-
for _, networkCache := range cache.egressIPNameToPods {
1605+
for eipName, networkCache := range cache.egressIPNameToPods {
16041606
for networkName, data := range networkCache {
16051607
logicalRouterPolicyStaleNexthops := []*nbdb.LogicalRouterPolicy{}
1608+
// select LRPs scoped to the correct LRP priority, network and EIP name
16061609
p := func(item *nbdb.LogicalRouterPolicy) bool {
16071610
if item.Priority != types.EgressIPReroutePriority || item.ExternalIDs[libovsdbops.NetworkKey.String()] != networkName {
16081611
return false
16091612
}
1610-
egressIPName, _ := getEIPLRPObjK8MetaData(item.ExternalIDs)
1611-
if egressIPName == "" {
1613+
networkNodeRedirectCache, ok := cache.egressNodeRedirectsCache.cache[networkName]
1614+
if !ok || len(networkNodeRedirectCache) == 0 {
1615+
klog.Infof("syncStaleEgressReroutePolicy found invalid logical router policy (UUID: %s) because no assigned Nodes for EgressIP %s", item.UUID, eipName)
1616+
return true
1617+
}
1618+
extractedEgressIPName, _ := getEIPLRPObjK8MetaData(item.ExternalIDs)
1619+
if extractedEgressIPName == "" {
16121620
klog.Errorf("syncStaleEgressReroutePolicy found logical router policy (UUID: %s) with invalid meta data associated with network %s", item.UUID, networkName)
1613-
return false
1621+
return true
1622+
}
1623+
if extractedEgressIPName != eipName {
1624+
// remove if there's no reference to this EIP name
1625+
_, ok := cache.egressIPNameToPods[extractedEgressIPName]
1626+
return !ok
16141627
}
16151628
splitMatch := strings.Split(item.Match, " ")
1616-
logicalIP := splitMatch[len(splitMatch)-1]
1617-
parsedLogicalIP := net.ParseIP(logicalIP)
1629+
podIPStr := splitMatch[len(splitMatch)-1]
1630+
podIP := net.ParseIP(podIPStr)
1631+
if podIP == nil {
1632+
klog.Infof("syncStaleEgressReroutePolicy found invalid LRP with broken match with UID %q", item.UUID)
1633+
return true
1634+
}
16181635
egressPodIPs := sets.NewString()
16191636
// Since LRPs are created only for pods local to this zone
16201637
// we need to care about only those pods. Nexthop for them will
@@ -1624,31 +1641,24 @@ func (e *EgressIPController) syncStaleEgressReroutePolicy(cache egressIPCache) e
16241641
for _, podIPs := range data.egressLocalPods {
16251642
egressPodIPs.Insert(podIPs.UnsortedList()...)
16261643
}
1627-
if !egressPodIPs.Has(parsedLogicalIP.String()) {
1628-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no nexthop or stale logical ip: %v", egressIPName, item)
1644+
if !egressPodIPs.Has(podIP.String()) {
1645+
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no nexthop or stale logical ip: %v", extractedEgressIPName, item)
16291646
return true
16301647
}
16311648
// Check for stale nexthops that may exist in the logical router policy and store that in logicalRouterPolicyStaleNexthops.
16321649
// Note: adding missing nexthop(s) to the logical router policy is done outside the scope of this function.
16331650
staleNextHops := []string{}
16341651
for _, nexthop := range item.Nexthops {
1635-
nodeName, ok := cache.egressIPIPToNodeCache[parsedLogicalIP.String()]
1636-
if ok {
1637-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no node assigned to logical ip: %v", egressIPName, item)
1638-
return true
1639-
}
1640-
networksRedirects, ok := cache.egressNodeRedirectsCache.cache[nodeName]
1641-
if ok {
1642-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no network in cache: %v", egressIPName, item)
1643-
return true
1644-
}
1645-
redirects, ok := networksRedirects[networkName]
1646-
if !ok {
1647-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no redirects for network in cache: %v", egressIPName, item)
1648-
return true
1652+
// ensure valid next hop by iterating through the node config
1653+
var isFound bool // isFound is true, if the next hop IP is found within the set of assigned nodes
1654+
for _, nodeRedirect := range networkNodeRedirectCache {
1655+
if nodeRedirect.containsIP(nexthop) {
1656+
isFound = true
1657+
break
1658+
}
16491659
}
1650-
//FIXME: be more specific about which is the valid next hop instead of relying on verifying if the IP is within a valid set of IPs.
1651-
if !redirects.containsIP(nexthop) {
1660+
if !isFound {
1661+
//FIXME: be more specific about which is the valid next hop instead of relying on verifying if the IP is within a valid set of IPs.
16521662
staleNextHops = append(staleNextHops, nexthop)
16531663
}
16541664
}
@@ -1907,9 +1917,12 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19071917
// This will help sync SNATs
19081918
egressLocalNodesCache := sets.New[string]()
19091919
cache.egressLocalNodesCache = egressLocalNodesCache
1910-
// egressIP name -> node name
1911-
egressNodesCache := make(map[string]string, 0)
1912-
cache.egressIPIPToNodeCache = egressNodesCache
1920+
// egressIP name -> nodes where the IPs are assigned
1921+
egressIPNameNodesCache := make(map[string][]string, 0)
1922+
cache.egressIPNameToAssignedNodes = egressIPNameNodesCache
1923+
// egressIP IP -> node name. Assigned node for EIP.
1924+
egressIPIPNodeCache := make(map[string]string, 0)
1925+
cache.egressIPIPToNodeCache = egressIPIPNodeCache
19131926
cache.markCache = make(map[string]string)
19141927
egressIPs, err := e.watchFactory.GetEgressIPs()
19151928
if err != nil {
@@ -1922,11 +1935,18 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19221935
}
19231936
cache.markCache[egressIP.Name] = mark.String()
19241937
egressIPsCache[egressIP.Name] = make(map[string]selectedPods, 0)
1938+
egressIPNameNodesCache[egressIP.Name] = make([]string, 0, len(egressIP.Status.Items))
19251939
for _, status := range egressIP.Status.Items {
1940+
eipIP := net.ParseIP(status.EgressIP)
1941+
if eipIP == nil {
1942+
klog.Errorf("Failed to parse EgressIP %s IP %q from status", egressIP.Name, status.EgressIP)
1943+
continue
1944+
}
1945+
egressIPIPNodeCache[eipIP.String()] = status.Node
19261946
if localZoneNodes.Has(status.Node) {
19271947
egressLocalNodesCache.Insert(status.Node)
19281948
}
1929-
egressNodesCache[status.EgressIP] = status.Node
1949+
egressIPNameNodesCache[egressIP.Name] = append(egressIPNameNodesCache[egressIP.Name], status.Node)
19301950
}
19311951
namespaces, err = e.watchFactory.GetNamespacesBySelector(egressIP.Spec.NamespaceSelector)
19321952
if err != nil {

0 commit comments

Comments
 (0)