Skip to content

Commit b179c50

Browse files
authored
Merge pull request #5295 from martinkennelly/fix-eip-syncer
EgressIP: fix startup syncer
2 parents 6bc337b + 053585e commit b179c50

File tree

6 files changed

+667
-70
lines changed

6 files changed

+667
-70
lines changed

go-controller/pkg/ovn/egressip.go

Lines changed: 76 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,8 @@ func (e *EgressIPController) isLocalZoneNode(node *corev1.Node) bool {
11381138
type egressIPCache struct {
11391139
// egressIP name -> network name -> cache
11401140
egressIPNameToPods map[string]map[string]selectedPods
1141+
// egressIP name -> to assigned Node names
1142+
egressIPNameToAssignedNodes map[string][]string
11411143
// egressLocalNodes will contain all nodes that are local
11421144
// to this zone which are serving this egressIP object..
11431145
// This will help sync SNATs
@@ -1154,7 +1156,7 @@ type egressIPCache struct {
11541156
}
11551157

11561158
type nodeNetworkRedirects struct {
1157-
// node name -> network name -> redirect IPs
1159+
// network name -> node name -> redirect IPs
11581160
cache map[string]map[string]redirectIPs
11591161
}
11601162

@@ -1444,7 +1446,7 @@ func (e *EgressIPController) syncStaleGWMarkRules(egressIPCache egressIPCache) e
14441446
continue
14451447
}
14461448
for networkName, podCache := range networkPodCache {
1447-
for eIP, nodeName := range egressIPCache.egressIPIPToNodeCache {
1449+
for eIPIP, nodeName := range egressIPCache.egressIPIPToNodeCache {
14481450
if !egressIPCache.egressLocalNodesCache.Has(nodeName) {
14491451
continue
14501452
}
@@ -1458,7 +1460,7 @@ func (e *EgressIPController) syncStaleGWMarkRules(egressIPCache egressIPCache) e
14581460
return fmt.Errorf("failed to create new network %s: %v", networkName, err)
14591461
}
14601462
routerName := ni.GetNetworkScopedGWRouterName(nodeName)
1461-
isEIPIPv6 := utilnet.IsIPv6String(eIP)
1463+
isEIPIPv6 := utilnet.IsIPv6String(eIPIP)
14621464
for podKey, podIPs := range podCache.egressLocalPods {
14631465
ops, err = processPodFn(ops, eIPName, podKey, egressIPCache.markCache[eIPName], routerName, networkName, podIPs, isEIPIPv6)
14641466
if err != nil {
@@ -1600,21 +1602,36 @@ func (e *EgressIPController) syncPodAssignmentCache(egressIPCache egressIPCache)
16001602
// It also removes stale nexthops from router policies used by EgressIPs.
16011603
// Upon failure, it may be invoked multiple times in order to avoid a pod restart.
16021604
func (e *EgressIPController) syncStaleEgressReroutePolicy(cache egressIPCache) error {
1603-
for _, networkCache := range cache.egressIPNameToPods {
1605+
for eipName, networkCache := range cache.egressIPNameToPods {
16041606
for networkName, data := range networkCache {
16051607
logicalRouterPolicyStaleNexthops := []*nbdb.LogicalRouterPolicy{}
1608+
// select LRPs scoped to the correct LRP priority, network and EIP name
16061609
p := func(item *nbdb.LogicalRouterPolicy) bool {
16071610
if item.Priority != types.EgressIPReroutePriority || item.ExternalIDs[libovsdbops.NetworkKey.String()] != networkName {
16081611
return false
16091612
}
1610-
egressIPName, _ := getEIPLRPObjK8MetaData(item.ExternalIDs)
1611-
if egressIPName == "" {
1613+
networkNodeRedirectCache, ok := cache.egressNodeRedirectsCache.cache[networkName]
1614+
if !ok || len(networkNodeRedirectCache) == 0 {
1615+
klog.Infof("syncStaleEgressReroutePolicy found invalid logical router policy (UUID: %s) because no assigned Nodes for EgressIP %s", item.UUID, eipName)
1616+
return true
1617+
}
1618+
extractedEgressIPName, _ := getEIPLRPObjK8MetaData(item.ExternalIDs)
1619+
if extractedEgressIPName == "" {
16121620
klog.Errorf("syncStaleEgressReroutePolicy found logical router policy (UUID: %s) with invalid meta data associated with network %s", item.UUID, networkName)
1613-
return false
1621+
return true
1622+
}
1623+
if extractedEgressIPName != eipName {
1624+
// remove if there's no reference to this EIP name
1625+
_, ok := cache.egressIPNameToPods[extractedEgressIPName]
1626+
return !ok
16141627
}
16151628
splitMatch := strings.Split(item.Match, " ")
1616-
logicalIP := splitMatch[len(splitMatch)-1]
1617-
parsedLogicalIP := net.ParseIP(logicalIP)
1629+
podIPStr := splitMatch[len(splitMatch)-1]
1630+
podIP := net.ParseIP(podIPStr)
1631+
if podIP == nil {
1632+
klog.Infof("syncStaleEgressReroutePolicy found invalid LRP with broken match with UID %q", item.UUID)
1633+
return true
1634+
}
16181635
egressPodIPs := sets.NewString()
16191636
// Since LRPs are created only for pods local to this zone
16201637
// we need to care about only those pods. Nexthop for them will
@@ -1624,31 +1641,24 @@ func (e *EgressIPController) syncStaleEgressReroutePolicy(cache egressIPCache) e
16241641
for _, podIPs := range data.egressLocalPods {
16251642
egressPodIPs.Insert(podIPs.UnsortedList()...)
16261643
}
1627-
if !egressPodIPs.Has(parsedLogicalIP.String()) {
1628-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no nexthop or stale logical ip: %v", egressIPName, item)
1644+
if !egressPodIPs.Has(podIP.String()) {
1645+
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no nexthop or stale logical ip: %v", extractedEgressIPName, item)
16291646
return true
16301647
}
16311648
// Check for stale nexthops that may exist in the logical router policy and store that in logicalRouterPolicyStaleNexthops.
16321649
// Note: adding missing nexthop(s) to the logical router policy is done outside the scope of this function.
16331650
staleNextHops := []string{}
16341651
for _, nexthop := range item.Nexthops {
1635-
nodeName, ok := cache.egressIPIPToNodeCache[parsedLogicalIP.String()]
1636-
if ok {
1637-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no node assigned to logical ip: %v", egressIPName, item)
1638-
return true
1639-
}
1640-
networksRedirects, ok := cache.egressNodeRedirectsCache.cache[nodeName]
1641-
if ok {
1642-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no network in cache: %v", egressIPName, item)
1643-
return true
1644-
}
1645-
redirects, ok := networksRedirects[networkName]
1646-
if !ok {
1647-
klog.Infof("syncStaleEgressReroutePolicy will delete %s due to no redirects for network in cache: %v", egressIPName, item)
1648-
return true
1652+
// ensure valid next hop by iterating through the node config
1653+
var isFound bool // isFound is true, if the next hop IP is found within the set of assigned nodes
1654+
for _, nodeRedirect := range networkNodeRedirectCache {
1655+
if nodeRedirect.containsIP(nexthop) {
1656+
isFound = true
1657+
break
1658+
}
16491659
}
1650-
//FIXME: be more specific about which is the valid next hop instead of relying on verifying if the IP is within a valid set of IPs.
1651-
if !redirects.containsIP(nexthop) {
1660+
if !isFound {
1661+
//FIXME: be more specific about which is the valid next hop instead of relying on verifying if the IP is within a valid set of IPs.
16521662
staleNextHops = append(staleNextHops, nexthop)
16531663
}
16541664
}
@@ -1669,7 +1679,14 @@ func (e *EgressIPController) syncStaleEgressReroutePolicy(cache egressIPCache) e
16691679

16701680
// Update Logical Router Policies that have stale nexthops. Notice that we must do this separately
16711681
// because logicalRouterPolicyStaleNexthops must be populated first
1672-
klog.Infof("syncStaleEgressReroutePolicy will remove stale nexthops for network %s: %+v", networkName, logicalRouterPolicyStaleNexthops)
1682+
for _, staleNextHopLogicalRouterPolicy := range logicalRouterPolicyStaleNexthops {
1683+
if staleNextHopLogicalRouterPolicy.Nexthop == nil {
1684+
continue
1685+
}
1686+
klog.Infof("syncStaleEgressReroutePolicy will remove stale nexthops for LRP %q for network %s: %s",
1687+
staleNextHopLogicalRouterPolicy.UUID, networkName, *staleNextHopLogicalRouterPolicy.Nexthop)
1688+
}
1689+
16731690
err = libovsdbops.DeleteNextHopsFromLogicalRouterPolicies(e.nbClient, cache.networkToRouter[networkName], logicalRouterPolicyStaleNexthops...)
16741691
if err != nil {
16751692
return fmt.Errorf("unable to remove stale next hops from logical router policies for network %s: %v", networkName, err)
@@ -1699,7 +1716,13 @@ func (e *EgressIPController) syncStaleSNATRules(egressIPCache egressIPCache) err
16991716
return false
17001717
}
17011718
egressIPName := egressIPMeta[0]
1702-
parsedLogicalIP := net.ParseIP(item.LogicalIP).String()
1719+
// check logical IP maps to a valid pod
1720+
parsedPodIP := net.ParseIP(item.LogicalIP)
1721+
if parsedPodIP == nil {
1722+
klog.Errorf("syncStaleSNATRules found invalid logical IP for NAT with UID %q", item.UUID)
1723+
return true
1724+
}
1725+
parsedPodIPStr := parsedPodIP.String()
17031726
cacheEntry, exists := egressIPCache.egressIPNameToPods[egressIPName][types.DefaultNetworkName]
17041727
egressPodIPs := sets.NewString()
17051728
if exists {
@@ -1712,7 +1735,7 @@ func (e *EgressIPController) syncStaleSNATRules(egressIPCache egressIPCache) err
17121735
egressPodIPs.Insert(podIPs.UnsortedList()...)
17131736
}
17141737
}
1715-
if !exists || !egressPodIPs.Has(parsedLogicalIP) {
1738+
if !exists || !egressPodIPs.Has(parsedPodIPStr) {
17161739
klog.Infof("syncStaleSNATRules will delete %s due to logical ip: %v", egressIPName, item)
17171740
return true
17181741
}
@@ -1721,9 +1744,15 @@ func (e *EgressIPController) syncStaleSNATRules(egressIPCache egressIPCache) err
17211744
klog.Errorf("syncStaleSNATRules failed to find default network in networks cache")
17221745
return false
17231746
}
1724-
if node, ok := egressIPCache.egressIPIPToNodeCache[item.ExternalIP]; !ok || !cacheEntry.egressLocalPods[types.DefaultNetworkName].Has(node) ||
1725-
item.LogicalPort == nil || *item.LogicalPort != ni.GetNetworkScopedK8sMgmtIntfName(node) {
1726-
klog.Infof("syncStaleSNATRules will delete %s due to external ip or stale logical port: %v", egressIPName, item)
1747+
// check external IP maps to a valid EgressIP IP and its assigned to a Node
1748+
node, ok := egressIPCache.egressIPIPToNodeCache[item.ExternalIP]
1749+
if !ok {
1750+
klog.Infof("syncStaleSNATRules found NAT %q without EIP assigned to a Node", item.UUID)
1751+
return true
1752+
}
1753+
// check logical port is set and correspondes to the correct egress node
1754+
if item.LogicalPort == nil || *item.LogicalPort != ni.GetNetworkScopedK8sMgmtIntfName(node) {
1755+
klog.Infof("syncStaleSNATRules found NAT %q with invalid logical port", item.UUID)
17271756
return true
17281757
}
17291758
return false
@@ -1907,9 +1936,12 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19071936
// This will help sync SNATs
19081937
egressLocalNodesCache := sets.New[string]()
19091938
cache.egressLocalNodesCache = egressLocalNodesCache
1910-
// egressIP name -> node name
1911-
egressNodesCache := make(map[string]string, 0)
1912-
cache.egressIPIPToNodeCache = egressNodesCache
1939+
// egressIP name -> nodes where the IPs are assigned
1940+
egressIPNameNodesCache := make(map[string][]string, 0)
1941+
cache.egressIPNameToAssignedNodes = egressIPNameNodesCache
1942+
// egressIP IP -> node name. Assigned node for EIP.
1943+
egressIPIPNodeCache := make(map[string]string, 0)
1944+
cache.egressIPIPToNodeCache = egressIPIPNodeCache
19131945
cache.markCache = make(map[string]string)
19141946
egressIPs, err := e.watchFactory.GetEgressIPs()
19151947
if err != nil {
@@ -1922,11 +1954,18 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19221954
}
19231955
cache.markCache[egressIP.Name] = mark.String()
19241956
egressIPsCache[egressIP.Name] = make(map[string]selectedPods, 0)
1957+
egressIPNameNodesCache[egressIP.Name] = make([]string, 0, len(egressIP.Status.Items))
19251958
for _, status := range egressIP.Status.Items {
1959+
eipIP := net.ParseIP(status.EgressIP)
1960+
if eipIP == nil {
1961+
klog.Errorf("Failed to parse EgressIP %s IP %q from status", egressIP.Name, status.EgressIP)
1962+
continue
1963+
}
1964+
egressIPIPNodeCache[eipIP.String()] = status.Node
19261965
if localZoneNodes.Has(status.Node) {
19271966
egressLocalNodesCache.Insert(status.Node)
19281967
}
1929-
egressNodesCache[status.EgressIP] = status.Node
1968+
egressIPNameNodesCache[egressIP.Name] = append(egressIPNameNodesCache[egressIP.Name], status.Node)
19301969
}
19311970
namespaces, err = e.watchFactory.GetNamespacesBySelector(egressIP.Spec.NamespaceSelector)
19321971
if err != nil {

0 commit comments

Comments
 (0)