Skip to content

Commit 4b75652

Browse files
kyrtapzpperiyasamy
authored andcommitted
egressip: fix race condition when a node becomes reachable
Fix a race condition where egress IP assignments could be lost when: 1. A node becomes reachable and is added to the retry queue. 2. Before the periodicallyRetryResources executes(up to 30s) a node update event occurs and does nothing as there is no change to node readiness. 3. The update removes the retry entry, causing pending egress IPs to not be assigned. 4. Add lock for fakeEgressIPHealthClient (Peri) The fix triggers immediate retry queue processing via RequestRetryObjs after adding the reachable node to the queue, ensuring egress IP assignments are processed before any competing node update events. Co-Authored-By: Periyasamy Palanisamy <[email protected]> Signed-off-by: Patryk Diak <[email protected]> Signed-off-by: Arti Sood <[email protected]>
1 parent b681345 commit 4b75652

File tree

2 files changed

+36
-10
lines changed

2 files changed

+36
-10
lines changed

go-controller/pkg/clustermanager/egressip_controller.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -615,9 +615,13 @@ func checkEgressNodesReachabilityIterate(eIPC *egressIPClusterController) {
615615
nodeToAdd, err := eIPC.watchFactory.GetNode(nodeName)
616616
if err != nil {
617617
klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", nodeName, err)
618-
} else if err := eIPC.retryEgressNodes.AddRetryObjWithAddNoBackoff(nodeToAdd); err != nil {
618+
continue
619+
}
620+
if err := eIPC.retryEgressNodes.AddRetryObjWithAddNoBackoff(nodeToAdd); err != nil {
619621
klog.Errorf("Node: %s is detected as reachable and ready again, but could not re-assign egress IPs, err: %v", nodeName, err)
622+
continue
620623
}
624+
eIPC.retryEgressNodes.RequestRetryObjs()
621625
}
622626
}
623627
}

go-controller/pkg/clustermanager/egressip_controller_test.go

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"net"
77
"strconv"
8+
"sync"
89
"time"
910

1011
"github.com/onsi/ginkgo/v2"
@@ -36,13 +37,18 @@ type fakeEgressIPHealthClient struct {
3637
Connected bool
3738
ProbeCount int
3839
FakeProbeFailure bool
40+
mutex sync.Mutex
3941
}
4042

4143
func (fehc *fakeEgressIPHealthClient) IsConnected() bool {
44+
fehc.mutex.Lock()
45+
defer fehc.mutex.Unlock()
4246
return fehc.Connected
4347
}
4448

4549
func (fehc *fakeEgressIPHealthClient) Connect(context.Context, []net.IP, int) bool {
50+
fehc.mutex.Lock()
51+
defer fehc.mutex.Unlock()
4652
if fehc.FakeProbeFailure {
4753
return false
4854
}
@@ -51,18 +57,34 @@ func (fehc *fakeEgressIPHealthClient) Connect(context.Context, []net.IP, int) bo
5157
}
5258

5359
func (fehc *fakeEgressIPHealthClient) Disconnect() {
60+
fehc.mutex.Lock()
61+
defer fehc.mutex.Unlock()
5462
fehc.Connected = false
5563
fehc.ProbeCount = 0
5664
}
5765

5866
func (fehc *fakeEgressIPHealthClient) Probe(context.Context) bool {
67+
fehc.mutex.Lock()
68+
defer fehc.mutex.Unlock()
5969
if fehc.Connected && !fehc.FakeProbeFailure {
6070
fehc.ProbeCount++
6171
return true
6272
}
6373
return false
6474
}
6575

76+
func (fehc *fakeEgressIPHealthClient) getProbeCount() int {
77+
fehc.mutex.Lock()
78+
defer fehc.mutex.Unlock()
79+
return fehc.ProbeCount
80+
}
81+
82+
func (fehc *fakeEgressIPHealthClient) setFakeProbeFailure(probeFailure bool) {
83+
fehc.mutex.Lock()
84+
defer fehc.mutex.Unlock()
85+
fehc.FakeProbeFailure = probeFailure
86+
}
87+
6688
type fakeEgressIPHealthClientAllocator struct{}
6789

6890
func (f *fakeEgressIPHealthClientAllocator) allocate(string) healthcheck.EgressIPHealthClient {
@@ -1122,26 +1144,26 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() {
11221144

11231145
if !prevNodeIsConnected && !currNodeIsConnected {
11241146
// Not connected (before and after): no probes should be successful
1125-
gomega.Expect(hcc.ProbeCount).To(gomega.Equal(prevProbes), desc)
1147+
gomega.Expect(hcc.getProbeCount()).To(gomega.Equal(prevProbes), desc)
11261148
} else if prevNodeIsConnected && currNodeIsConnected {
11271149
if failProbes {
11281150
// Still connected, but no probes should be successful
1129-
gomega.Expect(prevProbes).To(gomega.Equal(hcc.ProbeCount), desc)
1151+
gomega.Expect(prevProbes).To(gomega.Equal(hcc.getProbeCount()), desc)
11301152
} else {
11311153
// Still connected and probe counters should be going up
1132-
gomega.Expect(prevProbes).To(gomega.BeNumerically("<", hcc.ProbeCount), desc)
1154+
gomega.Expect(prevProbes).To(gomega.BeNumerically("<", hcc.getProbeCount()), desc)
11331155
}
11341156
}
11351157
}
11361158

11371159
for _, tt := range tests {
1138-
hcc1.FakeProbeFailure = tt.node1FailProbes
1139-
hcc2.FakeProbeFailure = tt.node2FailProbes
1160+
hcc1.setFakeProbeFailure(tt.node1FailProbes)
1161+
hcc2.setFakeProbeFailure(tt.node2FailProbes)
11401162

11411163
prevNode1IsConnected := hcc1.IsConnected()
11421164
prevNode2IsConnected := hcc2.IsConnected()
1143-
prevNode1Probes := hcc1.ProbeCount
1144-
prevNode2Probes := hcc2.ProbeCount
1165+
prevNode1Probes := hcc1.getProbeCount()
1166+
prevNode2Probes := hcc2.getProbeCount()
11451167

11461168
if tt.tcPrepareFunc != nil {
11471169
tt.tcPrepareFunc(hcc1, hcc2)
@@ -1686,12 +1708,12 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() {
16861708
egressIPs, _ := getEgressIPStatus(eIP1.Name)
16871709
gomega.Expect(egressIPs[0]).To(gomega.Equal(egressIP))
16881710
hcClient := fakeClusterManagerOVN.eIPC.nodeAllocator.cache[node.Name].healthClient.(*fakeEgressIPHealthClient)
1689-
hcClient.FakeProbeFailure = true
1711+
hcClient.setFakeProbeFailure(true)
16901712
// explicitly call check reachability, periodic checker is not active
16911713
checkEgressNodesReachabilityIterate(fakeClusterManagerOVN.eIPC)
16921714
gomega.Eventually(getEgressIPStatusLen(eIP1.Name)).Should(gomega.Equal(0))
16931715

1694-
hcClient.FakeProbeFailure = false
1716+
hcClient.setFakeProbeFailure(false)
16951717
node.Annotations["test"] = "dummy"
16961718
_, err = fakeClusterManagerOVN.fakeClient.KubeClient.CoreV1().Nodes().Update(context.TODO(), &node, metav1.UpdateOptions{})
16971719
gomega.Expect(err).NotTo(gomega.HaveOccurred())

0 commit comments

Comments
 (0)