Skip to content

Commit e1af9e9

Browse files
committed
Fix panic/race around openflowmanager
This panic was observed in CI [1]. It looks related to a nil pointer. It is difficult to explain because everything should have been initialized already at that point. My only explanation is that the netConfig map is being reallocated and we hit a nil pointer in the meantime. Protecting access to that map as it shoudl have been already. 1. "nce\n[signal SIGSEGV: segmentation violation code=0x1 addr=0x80 pc=0x2287ff3]\n\ngoroutine 112590 [running]:\nk8s.io/apimachinery/pkg/util/runtime.handleCrash({0x30098f8, 0x4ea1680}, {0x26e5ee0, 0x47fc2b0}, {0x4ea1680, 0x0, 0x43cba5?})\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/apimachinery/pkg/util/runtime/runtime.go:89 +0xee\nk8s.io/apimachinery/pkg/util/runtime.HandleCrash({0x0, 0x0, 0xc00b3f0a80?})\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/apimachinery/pkg/util/runtime/runtime.go:59 +0x108\npanic({0x26e5ee0?, 0x47fc2b0?})\n\t/usr/lib/golang/src/runtime/panic.go:785 +0x132\ngithub.com/ovn-org/ovn-kubernetes/go-controller/pkg/node.(*UserDefinedNetworkGateway).doReconcile(0xc00fc49170)\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/node/gateway_udn.go:922 +0x173\nk8s.io/client-go/util/retry.OnError.func1()\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/client-go/util/retry/util.go:51 +0x30\nk8s.io/apimachinery/pkg/util/wait.runConditionWithCrashProtection(0x989680?)\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/apimachinery/pkg/util/wait/wait.go:145 +0x3e\nk8s.io/apimachinery/pkg/util/wait.ExponentialBackoff({0x2faf080, 0x4014000000000000, 0x0, 0x3, 0x0}, 0xc005133ef0)\n\t/go/src/github.com/openshift/ovn-kubernetes/go-controller", Signed-off-by: Jaime Caamaño Ruiz <[email protected]>
1 parent 7e0aa60 commit e1af9e9

File tree

3 files changed

+20
-5
lines changed

3 files changed

+20
-5
lines changed

go-controller/pkg/node/gateway_shared_intf.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ func (npw *nodePortWatcher) updateServiceFlowCache(service *corev1.Service, netI
422422
ipPrefix = "ipv6"
423423
}
424424
// table 2, user-defined network host -> OVN towards default cluster network services
425-
defaultNetConfig := npw.ofm.defaultBridge.getActiveNetworkBridgeConfig(types.DefaultNetworkName)
425+
defaultNetConfig := npw.ofm.defaultBridge.getActiveNetworkBridgeConfigCopy(types.DefaultNetworkName)
426426
// sample flow: cookie=0xdeff105, duration=2319.685s, table=2, n_packets=496, n_bytes=67111, priority=300,
427427
// ip,nw_dst=10.96.0.1 actions=mod_dl_dst:02:42:ac:12:00:03,output:"patch-breth0_ov"
428428
// This flow is used for UDNs and advertised UDNs to be able to reach kapi and dns services alone on default network

go-controller/pkg/node/gateway_udn.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,18 @@ func (b *bridgeConfiguration) delNetworkBridgeConfig(nInfo util.NetInfo) {
146146
delete(b.netConfig, nInfo.GetNetworkName())
147147
}
148148

149-
// getActiveNetworkBridgeConfig returns a shallow copy of the network configuration corresponding to the
149+
func (b *bridgeConfiguration) getNetworkBridgeConfig(networkName string) *bridgeUDNConfiguration {
150+
b.Lock()
151+
defer b.Unlock()
152+
return b.netConfig[networkName]
153+
}
154+
155+
// getActiveNetworkBridgeConfigCopy returns a shallow copy of the network configuration corresponding to the
150156
// provided netInfo.
151157
//
152158
// NOTE: if the network configuration can't be found or if the network is not patched by OVN
153159
// yet this returns nil.
154-
func (b *bridgeConfiguration) getActiveNetworkBridgeConfig(networkName string) *bridgeUDNConfiguration {
160+
func (b *bridgeConfiguration) getActiveNetworkBridgeConfigCopy(networkName string) *bridgeUDNConfiguration {
155161
b.Lock()
156162
defer b.Unlock()
157163

@@ -917,9 +923,18 @@ func (udng *UserDefinedNetworkGateway) Reconcile() {
917923
func (udng *UserDefinedNetworkGateway) doReconcile() error {
918924
klog.Infof("Reconciling gateway with updates for UDN %s", udng.GetNetworkName())
919925

926+
// shouldn't happen
927+
if udng.openflowManager == nil || udng.openflowManager.defaultBridge == nil {
928+
return fmt.Errorf("openflow manager with default bridge configuration has not been provided for network %s", udng.GetNetworkName())
929+
}
930+
920931
// update bridge configuration
921932
isNetworkAdvertised := util.IsPodNetworkAdvertisedAtNode(udng.NetInfo, udng.node.Name)
922-
udng.openflowManager.defaultBridge.netConfig[udng.GetNetworkName()].advertised.Store(isNetworkAdvertised)
933+
netConfig := udng.openflowManager.defaultBridge.getNetworkBridgeConfig(udng.GetNetworkName())
934+
if netConfig == nil {
935+
return fmt.Errorf("missing bridge configuration for network %s", udng.GetNetworkName())
936+
}
937+
netConfig.advertised.Store(isNetworkAdvertised)
923938

924939
if err := udng.updateUDNVRFIPRules(isNetworkAdvertised); err != nil {
925940
return fmt.Errorf("error while updating ip rule for UDN %s: %s", udng.GetNetworkName(), err)

go-controller/pkg/node/openflow_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ func (c *openflowManager) delNetwork(nInfo util.NetInfo) {
5959
}
6060

6161
func (c *openflowManager) getActiveNetwork(nInfo util.NetInfo) *bridgeUDNConfiguration {
62-
return c.defaultBridge.getActiveNetworkBridgeConfig(nInfo.GetNetworkName())
62+
return c.defaultBridge.getActiveNetworkBridgeConfigCopy(nInfo.GetNetworkName())
6363
}
6464

6565
// END UDN UTILs

0 commit comments

Comments
 (0)