Skip to content

Commit 744eced

Browse files
authored
Merge pull request #5070 from trozet/fix_cm_node
Limit cluster manager node allocator updates/logs
2 parents ab2bbed + 562f752 commit 744eced

26 files changed

+327
-448
lines changed

go-controller/pkg/clustermanager/clustermanager.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,6 @@ import (
2525
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util"
2626
)
2727

28-
const (
29-
// ID of the default network.
30-
defaultNetworkID = 0
31-
)
32-
3328
// ClusterManager structure is the object which manages the cluster nodes.
3429
// It creates a default network controller for the default network and a
3530
// secondary network cluster controller manager to manage the multi networks.

go-controller/pkg/clustermanager/clustermanager_test.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -123,23 +123,6 @@ var _ = ginkgo.Describe("Cluster Manager", func() {
123123
}, 2).Should(gomega.HaveLen(1))
124124
}
125125

126-
// Check that the network id 0 is allocated for the default network
127-
for _, n := range nodes {
128-
gomega.Eventually(func() error {
129-
updatedNode, err := fakeClient.KubeClient.CoreV1().Nodes().Get(context.TODO(), n.Name, metav1.GetOptions{})
130-
if err != nil {
131-
return err
132-
}
133-
134-
networkId, err := util.ParseNetworkIDAnnotation(updatedNode, "default")
135-
if err != nil {
136-
return fmt.Errorf("expected node network id annotation for node %s to have been allocated", n.Name)
137-
}
138-
139-
gomega.Expect(networkId).To(gomega.Equal(0))
140-
return nil
141-
}).ShouldNot(gomega.HaveOccurred())
142-
}
143126
return nil
144127
}
145128

go-controller/pkg/clustermanager/network_cluster_controller.go

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func newDefaultNetworkClusterController(netInfo util.NetInfo, ovnClient *util.OV
120120
// defaiult network
121121
networkIDAllocator := id.NewIDAllocator(types.DefaultNetworkName, 1)
122122
// Reserve the id 0 for the default network.
123-
err := networkIDAllocator.ReserveID(types.DefaultNetworkName, defaultNetworkID)
123+
err := networkIDAllocator.ReserveID(types.DefaultNetworkName, types.DefaultNetworkID)
124124
if err != nil {
125125
panic(fmt.Errorf("could not reserve default network ID: %w", err))
126126
}
@@ -175,7 +175,7 @@ func (ncc *networkClusterController) init() error {
175175
if util.DoesNetworkRequireTunnelIDs(ncc.GetNetInfo()) {
176176
ncc.tunnelIDAllocator = id.NewIDAllocator(ncc.GetNetworkName(), types.MaxLogicalPortTunnelKey)
177177
// Reserve the id 0. We don't want to assign this id to any of the pods or nodes.
178-
if err = ncc.tunnelIDAllocator.ReserveID("zero", util.NoID); err != nil {
178+
if err = ncc.tunnelIDAllocator.ReserveID("zero", types.NoTunnelID); err != nil {
179179
return err
180180
}
181181
if util.IsNetworkSegmentationSupportEnabled() && ncc.IsPrimaryNetwork() {
@@ -196,7 +196,7 @@ func (ncc *networkClusterController) init() error {
196196
node.Name, ncc.GetNetworkName(), err)
197197
}
198198
}
199-
if tunnelID != util.InvalidID {
199+
if tunnelID != types.InvalidID {
200200
if err := ncc.tunnelIDAllocator.ReserveID(ncc.GetNetworkName()+"_"+node.Name, tunnelID); err != nil {
201201
return fmt.Errorf("unable to reserve id for network %s, node %s: %w", ncc.GetNetworkName(), node.Name, err)
202202
}
@@ -489,6 +489,8 @@ type networkClusterControllerEventHandler struct {
489489
objType reflect.Type
490490
ncc *networkClusterController
491491
syncFunc func([]interface{}) error
492+
493+
nodeSyncFailed sync.Map
492494
}
493495

494496
func (h *networkClusterControllerEventHandler) FilterOutResource(_ interface{}) bool {
@@ -520,6 +522,9 @@ func (h *networkClusterControllerEventHandler) AddResource(obj interface{}, _ bo
520522
err = h.ncc.nodeAllocator.HandleAddUpdateNodeEvent(node)
521523
if err == nil {
522524
h.clearInitialNodeNetworkUnavailableCondition(node)
525+
h.nodeSyncFailed.Delete(node.Name)
526+
} else {
527+
h.nodeSyncFailed.Store(node.Name, true)
523528
}
524529
statusErr := h.ncc.updateNetworkStatus(node.Name, err)
525530
joinedErr := errors.Join(err, statusErr)
@@ -557,19 +562,35 @@ func (h *networkClusterControllerEventHandler) UpdateResource(oldObj, newObj int
557562
return err
558563
}
559564
case factory.NodeType:
560-
node, ok := newObj.(*corev1.Node)
565+
oldNode, ok := oldObj.(*corev1.Node)
566+
if !ok {
567+
return fmt.Errorf("could not cast %T object to *corev1.Node", oldObj)
568+
}
569+
newNode, ok := newObj.(*corev1.Node)
561570
if !ok {
562571
return fmt.Errorf("could not cast %T object to *corev1.Node", newObj)
563572
}
564-
err = h.ncc.nodeAllocator.HandleAddUpdateNodeEvent(node)
573+
_, nodeFailed := h.nodeSyncFailed.Load(newNode.GetName())
574+
// Note: (trozet) It might be pedantic to check if the NeedsNodeAllocation. This assumes one of the following:
575+
// 1. we missed an add event (bug in kapi informer code)
576+
// 2. a user removed the annotation on the node
577+
// Either way to play it safe for now do a partial json unmarshal check
578+
if !nodeFailed && util.NoHostSubnet(oldNode) != util.NoHostSubnet(newNode) && !h.ncc.nodeAllocator.NeedsNodeAllocation(newNode) {
579+
// no other node updates would require us to reconcile again
580+
return nil
581+
}
582+
err = h.ncc.nodeAllocator.HandleAddUpdateNodeEvent(newNode)
565583
if err == nil {
566-
h.clearInitialNodeNetworkUnavailableCondition(node)
584+
h.clearInitialNodeNetworkUnavailableCondition(newNode)
585+
h.nodeSyncFailed.Delete(newNode.GetName())
586+
} else {
587+
h.nodeSyncFailed.Store(newNode.Name, true)
567588
}
568-
statusErr := h.ncc.updateNetworkStatus(node.Name, err)
589+
statusErr := h.ncc.updateNetworkStatus(newNode.Name, err)
569590
joinedErr := errors.Join(err, statusErr)
570591
if joinedErr != nil {
571592
klog.Infof("Cluster Manager Network Controller %q: Node update failed for %s, will try again later: %v",
572-
h.ncc.GetNetworkName(), node.Name, err)
593+
h.ncc.GetNetworkName(), newNode.Name, err)
573594
return err
574595
}
575596
case factory.IPAMClaimsType:
@@ -600,7 +621,12 @@ func (h *networkClusterControllerEventHandler) DeleteResource(obj, _ interface{}
600621
}
601622
err := h.ncc.nodeAllocator.HandleDeleteNode(node)
602623
statusErr := h.ncc.updateNetworkStatus(node.Name, err)
603-
return errors.Join(err, statusErr)
624+
jErr := errors.Join(err, statusErr)
625+
if jErr != nil {
626+
return jErr
627+
}
628+
h.nodeSyncFailed.Delete(node.Name)
629+
return nil
604630
case factory.IPAMClaimsType:
605631
ipamClaim, ok := obj.(*ipamclaimsapi.IPAMClaim)
606632
if !ok {

go-controller/pkg/clustermanager/node/node_allocator.go

Lines changed: 85 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,43 @@ func (na *NodeAllocator) releaseHybridOverlayNodeSubnet(nodeName string) {
175175
klog.Infof("Deleted hybrid overlay HostSubnets for node %s", nodeName)
176176
}
177177

178+
// NeedsNodeAllocation determines if the annotations that are assigned by NodeAllocator are missing on a node
179+
func (na *NodeAllocator) NeedsNodeAllocation(node *corev1.Node) bool {
180+
// hybrid overlay check
181+
if util.NoHostSubnet(node) {
182+
if na.hasHybridOverlayAllocation() {
183+
if _, ok := node.Annotations[hotypes.HybridOverlayNodeSubnet]; !ok {
184+
return true
185+
}
186+
}
187+
return false
188+
}
189+
190+
// ovn node check
191+
// allocation is all or nothing, so if one field was allocated from:
192+
// nodeSubnets, joinSubnet, layer 2 tunnel id, then all of them were
193+
if na.hasNodeSubnetAllocation() {
194+
if util.HasNodeHostSubnetAnnotation(node, na.netInfo.GetNetworkName()) {
195+
return false
196+
}
197+
}
198+
199+
if na.hasJoinSubnetAllocation() {
200+
if util.HasNodeGatewayRouterJoinNetwork(node, na.netInfo.GetNetworkName()) {
201+
return false
202+
}
203+
}
204+
205+
if util.IsNetworkSegmentationSupportEnabled() && na.netInfo.IsPrimaryNetwork() && util.DoesNetworkRequireTunnelIDs(na.netInfo) {
206+
if util.HasUDNLayer2NodeGRLRPTunnelID(node, na.netInfo.GetNetworkName()) {
207+
return false
208+
}
209+
}
210+
211+
return true
212+
213+
}
214+
178215
// HandleAddUpdateNodeEvent handles the add or update node event
179216
func (na *NodeAllocator) HandleAddUpdateNodeEvent(node *corev1.Node) error {
180217
defer na.recordSubnetUsage()
@@ -202,14 +239,21 @@ func (na *NodeAllocator) HandleAddUpdateNodeEvent(node *corev1.Node) error {
202239

203240
// syncNodeNetworkAnnotations does 2 things
204241
// - syncs the node's allocated subnets in the node subnet annotation
205-
// - syncs the network id in the node network id annotation
242+
// - syncs the join subnet annotation
243+
// - syncs the layer 2 tunnel id annotation
244+
// - syncs the network id in the node network id annotation (legacy)
206245
func (na *NodeAllocator) syncNodeNetworkAnnotations(node *corev1.Node) error {
207246
networkName := na.netInfo.GetNetworkName()
208247

209248
networkID, err := util.ParseNetworkIDAnnotation(node, networkName)
210-
if err != nil && !util.IsAnnotationNotSetError(err) {
211-
// Log the error and try to allocate new subnets
212-
klog.Warningf("Failed to get node %s network id annotations for network %s : %v", node.Name, networkName, err)
249+
if err != nil {
250+
if !util.IsAnnotationNotSetError(err) {
251+
// Log the error and try to allocate new subnets
252+
klog.Warningf("Failed to get node %s network id annotations for network %s : %v", node.Name, networkName, err)
253+
}
254+
// if there is an error we are going to set networkID here to NoNetworkID to prevent
255+
// the annotation being updated
256+
networkID = types.NoNetworkID
213257
}
214258

215259
updatedSubnetsMap := map[string][]*net.IPNet{}
@@ -280,14 +324,14 @@ func (na *NodeAllocator) syncNodeNetworkAnnotations(node *corev1.Node) error {
280324
updatedSubnetsMap[networkName] = validExistingSubnets
281325
}
282326
}
283-
newTunnelID := util.NoID
327+
newTunnelID := types.NoTunnelID
284328
if util.IsNetworkSegmentationSupportEnabled() && na.netInfo.IsPrimaryNetwork() && util.DoesNetworkRequireTunnelIDs(na.netInfo) {
285329
existingTunnelID, err := util.ParseUDNLayer2NodeGRLRPTunnelIDs(node, networkName)
286330
if err != nil && !util.IsAnnotationNotSetError(err) {
287331
return fmt.Errorf("failed to fetch tunnelID annotation from the node %s for network %s, err: %v",
288332
node.Name, networkName, err)
289333
}
290-
if existingTunnelID == util.InvalidID {
334+
if existingTunnelID == types.InvalidID {
291335
if newTunnelID, err = na.idAllocator.AllocateID(networkName + "_" + node.Name); err != nil {
292336
return fmt.Errorf("failed to assign node %s tunnel id for network %s: %w", node.Name, networkName, err)
293337
}
@@ -301,14 +345,24 @@ func (na *NodeAllocator) syncNodeNetworkAnnotations(node *corev1.Node) error {
301345
}
302346
}
303347

348+
// only update node annotation with ID if it had it before (networkID is not NoNetworkID)
349+
// and does not match.
350+
// NoNetworkID means do not update the annotation
351+
if networkID != types.NoNetworkID && networkID != na.networkID {
352+
networkID = na.networkID
353+
} else if networkID == na.networkID {
354+
// don't need to update if there was no change to the ID
355+
networkID = types.NoNetworkID
356+
}
357+
304358
// Also update the node annotation if the networkID doesn't match
305-
if len(updatedSubnetsMap) > 0 || na.networkID != networkID || len(allocatedJoinSubnets) > 0 || newTunnelID != util.NoID {
306-
err = na.updateNodeNetworkAnnotationsWithRetry(node.Name, updatedSubnetsMap, na.networkID, newTunnelID, allocatedJoinSubnets)
359+
if len(updatedSubnetsMap) > 0 || networkID != types.NoNetworkID || len(allocatedJoinSubnets) > 0 || newTunnelID != types.NoTunnelID {
360+
err = na.updateNodeNetworkAnnotationsWithRetry(node.Name, updatedSubnetsMap, networkID, newTunnelID, allocatedJoinSubnets)
307361
if err != nil {
308362
if errR := na.clusterSubnetAllocator.ReleaseNetworks(node.Name, allocatedSubnets...); errR != nil {
309363
klog.Warningf("Error releasing node %s subnets: %v", node.Name, errR)
310364
}
311-
if util.IsNetworkSegmentationSupportEnabled() && na.netInfo.IsPrimaryNetwork() && util.DoesNetworkRequireTunnelIDs(na.netInfo) {
365+
if newTunnelID != types.NoTunnelID {
312366
na.idAllocator.ReleaseID(networkName + "_" + node.Name)
313367
klog.Infof("Releasing node %s tunnelID for network %s since annotation update failed", node.Name, networkName)
314368
}
@@ -406,12 +460,14 @@ func (na *NodeAllocator) updateNodeNetworkAnnotationsWithRetry(nodeName string,
406460
return fmt.Errorf("failed to update node %q annotation LRPAddrAnnotation %s: %w",
407461
node.Name, util.JoinIPNets(joinAddr, ","), err)
408462
}
409-
cnode.Annotations, err = util.UpdateNetworkIDAnnotation(cnode.Annotations, networkName, networkId)
410-
if err != nil {
411-
return fmt.Errorf("failed to update node %q network id annotation %d for network %s: %w",
412-
node.Name, networkId, networkName, err)
463+
if networkId != types.NoNetworkID {
464+
cnode.Annotations, err = util.UpdateNetworkIDAnnotation(cnode.Annotations, networkName, networkId)
465+
if err != nil {
466+
return fmt.Errorf("failed to update node %q network id annotation %d for network %s: %w",
467+
node.Name, networkId, networkName, err)
468+
}
413469
}
414-
if tunnelID != util.NoID {
470+
if tunnelID != types.NoTunnelID {
415471
cnode.Annotations, err = util.UpdateUDNLayer2NodeGRLRPTunnelIDs(cnode.Annotations, networkName, tunnelID)
416472
if err != nil {
417473
return fmt.Errorf("failed to update node %q tunnel id annotation %d for network %s: %w",
@@ -448,7 +504,7 @@ func (na *NodeAllocator) Cleanup() error {
448504

449505
hostSubnetsMap := map[string][]*net.IPNet{networkName: nil}
450506
// passing util.InvalidID deletes the network/tunnel id annotation for the network.
451-
err = na.updateNodeNetworkAnnotationsWithRetry(node.Name, hostSubnetsMap, util.InvalidID, util.InvalidID, nil)
507+
err = na.updateNodeNetworkAnnotationsWithRetry(node.Name, hostSubnetsMap, types.InvalidID, types.InvalidID, nil)
452508
if err != nil {
453509
return fmt.Errorf("failed to clear node %q subnet annotation for network %s",
454510
node.Name, networkName)
@@ -472,8 +528,6 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
472528
expectedHostSubnets = 2
473529
}
474530

475-
klog.Infof("Expected %d subnets on node %s, found %d: %v", expectedHostSubnets, nodeName, len(existingSubnets), existingSubnets)
476-
477531
// If any existing subnets the node has are valid, mark them as reserved.
478532
// The node might have invalid or already-reserved subnets, or it might
479533
// have more subnets than configured in OVN (like for dual-stack to/from
@@ -486,7 +540,6 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
486540
for _, subnet := range existingSubnets {
487541
if (ipv4Mode && utilnet.IsIPv4CIDR(subnet) && !foundIPv4) || (ipv6Mode && utilnet.IsIPv6CIDR(subnet) && !foundIPv6) {
488542
if err := allocator.MarkAllocatedNetworks(nodeName, subnet); err == nil {
489-
klog.Infof("Valid subnet %v allocated on node %s", subnet, nodeName)
490543
existingSubnets[n] = subnet
491544
n++
492545
if utilnet.IsIPv4CIDR(subnet) {
@@ -498,17 +551,18 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
498551
}
499552
}
500553
// this subnet is no longer needed; release it
501-
klog.Infof("Releasing unused or invalid subnet %v on node %s", subnet, nodeName)
554+
klog.Infof("Releasing unused or invalid subnet %v on node %s, network %s",
555+
subnet, na.netInfo.GetNetworkName(), nodeName)
502556
if err := allocator.ReleaseNetworks(nodeName, subnet); err != nil {
503-
klog.Warningf("Failed to release subnet %v on node %s: %v", subnet, nodeName, err)
557+
klog.Warningf("Failed to release subnet %v on node %s, network %s: %v",
558+
subnet, nodeName, na.netInfo.GetNetworkName(), err)
504559
}
505560
}
506561
// recreate existingSubnets with the valid subnets
507562
existingSubnets = existingSubnets[:n]
508563

509564
// Node has enough valid subnets already allocated
510565
if len(existingSubnets) == expectedHostSubnets {
511-
klog.Infof("Allowed existing subnets %v on node %s", existingSubnets, nodeName)
512566
return existingSubnets, allocatedSubnets, nil
513567
}
514568

@@ -517,9 +571,10 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
517571
defer func() {
518572
if releaseAllocatedSubnets {
519573
for _, subnet := range allocatedSubnets {
520-
klog.Warningf("Releasing subnet %v on node %s", subnet, nodeName)
574+
klog.Warningf("Releasing subnet %v on node %s, network: %s", subnet, nodeName, na.netInfo.GetNetworkName())
521575
if errR := allocator.ReleaseNetworks(nodeName, subnet); errR != nil {
522-
klog.Warningf("Error releasing subnet %v on node %s: %v", subnet, nodeName, errR)
576+
klog.Warningf("Error releasing subnet %v on node %s, network %s: %v",
577+
subnet, na.netInfo.GetNetworkName(), nodeName, errR)
523578
}
524579
}
525580
}
@@ -528,12 +583,14 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
528583
// allocateOneSubnet is a helper to process the result of a subnet allocation
529584
allocateOneSubnet := func(allocatedHostSubnet *net.IPNet, allocErr error) error {
530585
if allocErr != nil {
531-
return fmt.Errorf("error allocating network for node %s: %v", nodeName, allocErr)
586+
return fmt.Errorf("error allocating network for node %s, network name %s: %v",
587+
nodeName, na.netInfo.GetNetworkName(), allocErr)
532588
}
533589
// the allocator returns nil if it can't provide a subnet
534590
// we should filter them out or they will be appended to the slice
535591
if allocatedHostSubnet != nil {
536-
klog.V(5).Infof("Allocating subnet %v on node %s", allocatedHostSubnet, nodeName)
592+
klog.V(5).Infof("Allocating subnet %v on node %s for network: %s",
593+
allocatedHostSubnet, nodeName, na.netInfo.GetNetworkName())
537594
allocatedSubnets = append(allocatedSubnets, allocatedHostSubnet)
538595
}
539596
return nil
@@ -556,12 +613,12 @@ func (na *NodeAllocator) allocateNodeSubnets(allocator SubnetAllocator, nodeName
556613
// so it will require a reconfiguration and restart.
557614
wantedSubnets := expectedHostSubnets - len(existingSubnets)
558615
if wantedSubnets > 0 && len(allocatedSubnets) != wantedSubnets {
559-
return nil, nil, fmt.Errorf("error allocating networks for node %s: %d subnets expected only new %d subnets allocated",
560-
nodeName, expectedHostSubnets, len(allocatedSubnets))
616+
return nil, nil, fmt.Errorf("error allocating networks for network: %s, node %s: %d subnets expected only new %d subnets allocated",
617+
na.netInfo.GetNetworkName(), nodeName, expectedHostSubnets, len(allocatedSubnets))
561618
}
562619

563620
hostSubnets := append(existingSubnets, allocatedSubnets...)
564-
klog.Infof("Allocated Subnets %v on Node %s", hostSubnets, nodeName)
621+
klog.Infof("Allocated Subnets %v on Node %s for Network: %s", hostSubnets, nodeName, na.netInfo.GetNetworkName())
565622

566623
// Success; prevent the release-on-error from triggering and return all node subnets
567624
releaseAllocatedSubnets = false

0 commit comments

Comments
 (0)