From 9600774c37e8e545497b7856446531a791f62dfb Mon Sep 17 00:00:00 2001 From: "Behzad.Mirkhanzadeh" Date: Wed, 2 Jul 2025 18:51:33 -0700 Subject: [PATCH 1/2] Fix: adding support for SWiftV2 Linux stateless CNI mode --- cni/network/network.go | 51 ++++++---- network/endpoint_linux.go | 9 ++ network/endpoint_windows.go | 5 + network/manager.go | 39 ++++++-- network/manager_mock.go | 17 +++- network/manager_test.go | 110 ++++++++++++++++++++- network/secondary_endpoint_client_linux.go | 81 ++++++++++++++- 7 files changed, 279 insertions(+), 33 deletions(-) diff --git a/cni/network/network.go b/cni/network/network.go index a3fa3784d9..73ddf4de7e 100644 --- a/cni/network/network.go +++ b/cni/network/network.go @@ -18,7 +18,6 @@ import ( "github.com/Azure/azure-container-networking/cni/util" "github.com/Azure/azure-container-networking/cns" cnscli "github.com/Azure/azure-container-networking/cns/client" - "github.com/Azure/azure-container-networking/cns/fsnotify" "github.com/Azure/azure-container-networking/common" "github.com/Azure/azure-container-networking/dhcp" "github.com/Azure/azure-container-networking/iptables" @@ -719,7 +718,7 @@ func (plugin *NetPlugin) createEpInfo(opt *createEpInfoOpt) (*network.EndpointIn *opt.infraSeen = true } else { ifName = "eth" + strconv.Itoa(opt.endpointIndex) - endpointID = plugin.nm.GetEndpointID(opt.args.ContainerID, ifName) + endpointID = plugin.nm.GetEndpointIDByNicType(opt.args.ContainerID, ifName, opt.ifInfo.NICType) } endpointInfo := network.EndpointInfo{ @@ -1075,33 +1074,45 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error { if plugin.nm.IsStatelessCNIMode() { // network ID is passed in and used only for migration // otherwise, in stateless, we don't need the network id for deletion - epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID) - // if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found + epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID, args.Netns) + // if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure + // return a retriable error so the container runtime will retry this DEL later + // the implementation of this function returns nil if the endpoint doesn't exist, so + // we don't have to check that here if err != nil { - // async delete should be disabled for standalone scenario - if errors.Is(err, network.ErrConnectionFailure) && !nwCfg.DisableAsyncDelete { - logger.Info("failed to connect to CNS", zap.String("containerID", args.ContainerID), zap.Error(err)) - addErr := fsnotify.AddFile(args.ContainerID, args.ContainerID, watcherPath) - logger.Info("add containerid file for Asynch delete", zap.String("containerID", args.ContainerID), zap.Error(addErr)) - if addErr != nil { - logger.Error("failed to add file to watcher", zap.String("containerID", args.ContainerID), zap.Error(addErr)) - return errors.Wrap(addErr, fmt.Sprintf("failed to add file to watcher with containerID %s", args.ContainerID)) + switch { + case errors.Is(err, network.ErrConnectionFailure): + logger.Error("Failed to connect to CNS", zap.Error(err)) + logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID)) + // In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS, + // we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state. + // This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations. + // When that happens, kubelet and containerd hang during sandbox creation or teardown. + // The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace, + // triggering hot-unplug/re-register events and leaving the node in an unhealthy state. + // This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state. + if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil { + logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err)) + return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err)) } - return nil - } - if errors.Is(err, network.ErrEndpointStateNotFound) { + case errors.Is(err, network.ErrEndpointStateNotFound): logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err)) return nil + default: + logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err)) + return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err)) + } + } else { + for _, epInfo := range epInfos { + logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType)) } - logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err)) - return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err)) } } else { epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID) } - // for when the endpoint is not created, but the ips are already allocated (only works if single network, single infra) - // this block is not applied to stateless CNI + // for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra) + // this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file if len(epInfos) == 0 { endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName) if !nwCfg.MultiTenancy { @@ -1127,7 +1138,7 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error { if err = plugin.nm.DeleteEndpoint(epInfo.NetworkID, epInfo.EndpointID, epInfo); err != nil { // An error will not be returned if the endpoint is not found // return a retriable error so the container runtime will retry this DEL later - // the implementation of this function returns nil if the endpoint doens't exist, so + // the implementation of this function returns nil if the endpoint doesn't exist, so // we don't have to check that here return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err)) } diff --git a/network/endpoint_linux.go b/network/endpoint_linux.go index 5f57a66d51..e2b7d9fd76 100644 --- a/network/endpoint_linux.go +++ b/network/endpoint_linux.go @@ -547,3 +547,12 @@ func getDefaultGateway(routes []RouteInfo) net.IP { func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*EndpointInfo, error) { return epInfo, nil } + +// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace. +func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error { + secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep) + if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil { + return err + } + return nil +} diff --git a/network/endpoint_windows.go b/network/endpoint_windows.go index 7b842dfe32..810e3f0e43 100644 --- a/network/endpoint_windows.go +++ b/network/endpoint_windows.go @@ -770,3 +770,8 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri logger.Info("Retrieved device problem code", zap.String("code", devpkeyDeviceProblemCode)) return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil } + +// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace. +func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error { + return nil +} diff --git a/network/manager.go b/network/manager.go index bef8858087..b5ca612a30 100644 --- a/network/manager.go +++ b/network/manager.go @@ -116,11 +116,13 @@ type NetworkManager interface { UpdateEndpoint(networkID string, existingEpInfo *EndpointInfo, targetEpInfo *EndpointInfo) error GetNumberOfEndpoints(ifName string, networkID string) int GetEndpointID(containerID, ifName string) string + GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string IsStatelessCNIMode() bool SaveState(eps []*endpoint) error DeleteState(epInfos []*EndpointInfo) error GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo - GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error) + GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) + RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error } // Creates a new network manager. @@ -456,7 +458,7 @@ func validateUpdateEndpointState(endpointID string, ifNameToIPInfoMap map[string // GetEndpointState will make a call to CNS GetEndpointState API in the stateless CNI mode to fetch the endpointInfo // TODO unit tests need to be added, WorkItem: 26606939 // In stateless cni, container id is the endpoint id, so you can pass in either -func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error) { +func (nm *networkManager) GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) { endpointResponse, err := nm.CnsClient.GetEndpoint(context.TODO(), containerID) if err != nil { if endpointResponse.Response.ReturnCode == types.NotFound { @@ -467,7 +469,7 @@ func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*En } return nil, ErrGetEndpointStateFailure } - epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID) + epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID, netns) for i := 0; i < len(epInfos); i++ { if epInfos[i].NICType == cns.InfraNIC { @@ -515,7 +517,7 @@ func (nm *networkManager) DeleteEndpointStateless(networkID string, epInfo *Endp nw := &network{ Id: networkID, // currently unused in stateless cni HnsId: epInfo.HNSNetworkID, - Mode: opModeTransparentVlan, + Mode: opModeTransparent, SnatBridgeIP: "", NetNs: dummyGUID, // to trigger hns v2, windows extIf: &externalInterface{ @@ -530,6 +532,7 @@ func (nm *networkManager) DeleteEndpointStateless(networkID string, epInfo *Endp HNSNetworkID: epInfo.HNSNetworkID, // unused (we use nw.HnsId for deleting the network) HostIfName: epInfo.HostIfName, LocalIP: "", + IPAddresses: epInfo.IPAddresses, VlanID: 0, AllowInboundFromHostToNC: false, // stateless currently does not support apipa AllowInboundFromNCToHost: false, @@ -538,11 +541,12 @@ func (nm *networkManager) DeleteEndpointStateless(networkID string, epInfo *Endp NetworkContainerID: epInfo.NetworkContainerID, // we don't use this as long as AllowInboundFromHostToNC and AllowInboundFromNCToHost are false NetNs: dummyGUID, // to trigger hnsv2, windows NICType: epInfo.NICType, + NetworkNameSpace: epInfo.NetNsPath, IfName: epInfo.IfName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client } logger.Info("Deleting endpoint with", zap.String("Endpoint Info: ", epInfo.PrettyString()), zap.String("HNISID : ", ep.HnsId)) - err := nw.deleteEndpointImpl(netlink.NewNetlink(), platform.NewExecClient(logger), nil, nil, nil, nil, nil, ep) + err := nw.deleteEndpointImpl(nm.netlink, nm.plClient, nil, nm.netio, nm.nsClient, nm.iptablesClient, nm.dhcpClient, ep) if err != nil { return err } @@ -563,7 +567,7 @@ func (nm *networkManager) GetEndpointInfo(networkID, endpointID string) (*Endpoi if nm.IsStatelessCNIMode() { logger.Info("calling cns getEndpoint API") - epInfos, err := nm.GetEndpointState(networkID, endpointID) + epInfos, err := nm.GetEndpointState(networkID, endpointID, "") if err != nil { return nil, err } @@ -746,6 +750,16 @@ func (nm *networkManager) GetEndpointID(containerID, ifName string) string { return containerID + "-" + ifName } +// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type. +func (nm *networkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string { + // For stateless CNI, secondary NICs use containerID-ifName as endpointID. + if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC { + return containerID + "-" + ifName + } + // For InfraNIC, use GetEndpointID() logic. + return nm.GetEndpointID(containerID, ifName) +} + // saves the map of network ids to endpoints to the state file func (nm *networkManager) SaveState(eps []*endpoint) error { nm.Lock() @@ -797,7 +811,7 @@ func (nm *networkManager) DeleteState(epInfos []*EndpointInfo) error { } // called to convert a cns restserver EndpointInfo into a network EndpointInfo -func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID string) []*EndpointInfo { +func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID, netns string) []*EndpointInfo { ret := []*EndpointInfo{} for ifName, ipInfo := range endpointInfo.IfnameToIPMap { @@ -867,3 +881,14 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo { return ifNametoIPInfoMap } + +// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns +func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error { + ep := &endpoint{ + NetworkNameSpace: netns, + IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client + } + logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns)) + err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient) + return err +} diff --git a/network/manager_mock.go b/network/manager_mock.go index 52ba4f3bc4..7a2686b285 100644 --- a/network/manager_mock.go +++ b/network/manager_mock.go @@ -1,6 +1,7 @@ package network import ( + "github.com/Azure/azure-container-networking/cns" "github.com/Azure/azure-container-networking/common" ) @@ -94,6 +95,16 @@ func (nm *MockNetworkManager) GetEndpointID(containerID, ifName string) string { return containerID + "-" + ifName } +// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type. +func (nm *MockNetworkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string { + // For stateless CNI, secondary NICs use containerID-ifName as endpointID. + if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC { + return containerID + "-" + ifName + } + // For InfraNIC, use GetEndpointID() logic. + return nm.GetEndpointID(containerID, ifName) +} + func (nm *MockNetworkManager) GetAllEndpoints(networkID string) (map[string]*EndpointInfo, error) { return nm.TestEndpointInfoMap, nil } @@ -207,6 +218,10 @@ func (nm *MockNetworkManager) GetEndpointInfosFromContainerID(containerID string return ret } -func (nm *MockNetworkManager) GetEndpointState(_, _ string) ([]*EndpointInfo, error) { +func (nm *MockNetworkManager) GetEndpointState(_, _, _ string) ([]*EndpointInfo, error) { return []*EndpointInfo{}, nil } + +func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error { + return nil +} diff --git a/network/manager_test.go b/network/manager_test.go index 8c8545b97a..4cc79a8d8b 100644 --- a/network/manager_test.go +++ b/network/manager_test.go @@ -350,7 +350,7 @@ var _ = Describe("Test Manager", func() { PodNamespace: "test-pod-ns", } - epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID) + epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "") Expect(len(epInfos)).To(Equal(1)) Expect(epInfos[0]).To(Equal( @@ -400,7 +400,7 @@ var _ = Describe("Test Manager", func() { PodNamespace: "test-pod-ns", } - epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID) + epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "") Expect(len(epInfos)).To(Equal(2)) Expect(epInfos).To(ContainElement( @@ -489,3 +489,109 @@ var _ = Describe("Test Manager", func() { }) }) }) + +func TestGetEndpointIDByNicType_Cases(t *testing.T) { + nm := &networkManager{} + + cases := []struct { + name string + stateless bool + containerID string + ifName string + nicType cns.NICType + expectedResult string + }{ + { + name: "Stateless InfraNIC", + stateless: true, + containerID: "container123", + ifName: "eth0", + nicType: cns.InfraNIC, + expectedResult: "container123", + }, + { + name: "Stateless SecondaryNIC", + stateless: true, + containerID: "container123", + ifName: "eth1", + nicType: cns.DelegatedVMNIC, + expectedResult: "container123-eth1", + }, + { + name: "Stateful InfraNIC", + stateless: false, + containerID: "container123456789", + ifName: "eth0", + nicType: cns.InfraNIC, + expectedResult: "containe-eth0", // truncated to 8 chars + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + nm.statelessCniMode = tc.stateless + id := nm.GetEndpointIDByNicType(tc.containerID, tc.ifName, tc.nicType) + if id != tc.expectedResult { + t.Errorf("expected %s, got %s", tc.expectedResult, id) + } + }) + } +} + +func TestCnsEndpointInfotoCNIEpInfos_Cases(t *testing.T) { + cases := []struct { + name string + ifName string + ipInfo restserver.IPInfo + netNs string + expectedNetNs string + expectedIfName string + expectedNICType cns.NICType + }{ + { + name: "DelegatedVMNIC", + ifName: "eth1", + netNs: "/var/run/netns/testns", + ipInfo: restserver.IPInfo{ + NICType: cns.DelegatedVMNIC, + }, + expectedNetNs: "/var/run/netns/testns", + expectedIfName: "eth1", + expectedNICType: cns.DelegatedVMNIC, + }, + { + name: "InfraNIC", + ifName: "eth0", + netNs: "", + ipInfo: restserver.IPInfo{ + NICType: cns.InfraNIC, + }, + expectedNetNs: "", + expectedIfName: "eth0", + expectedNICType: cns.InfraNIC, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + endpointInfo := restserver.EndpointInfo{ + IfnameToIPMap: map[string]*restserver.IPInfo{ + tc.ifName: &tc.ipInfo, + }, + } + epInfos := cnsEndpointInfotoCNIEpInfos(endpointInfo, "container123", tc.netNs) + if len(epInfos) == 0 { + t.Fatalf("expected at least one epInfo") + } + if epInfos[0].NetNsPath != tc.expectedNetNs { + t.Errorf("expected NetNsPath %q, got %q", tc.expectedNetNs, epInfos[0].NetNsPath) + } + if epInfos[0].IfName != tc.expectedIfName { + t.Errorf("expected IfName %q, got %q", tc.expectedIfName, epInfos[0].IfName) + } + if epInfos[0].NICType != tc.expectedNICType { + t.Errorf("expected NICType %v, got %v", tc.expectedNICType, epInfos[0].NICType) + } + }) + } +} diff --git a/network/secondary_endpoint_client_linux.go b/network/secondary_endpoint_client_linux.go index cf33e72a20..c0cd1df9b8 100644 --- a/network/secondary_endpoint_client_linux.go +++ b/network/secondary_endpoint_client_linux.go @@ -6,12 +6,14 @@ import ( "strings" "time" + "github.com/Azure/azure-container-networking/cns" "github.com/Azure/azure-container-networking/netio" "github.com/Azure/azure-container-networking/netlink" "github.com/Azure/azure-container-networking/netns" "github.com/Azure/azure-container-networking/network/networkutils" "github.com/Azure/azure-container-networking/platform" "github.com/pkg/errors" + vishnetlink "github.com/vishvananda/netlink" "go.uber.org/zap" ) @@ -191,15 +193,88 @@ func (client *SecondaryEndpointClient) DeleteEndpoints(ep *endpoint) error { logger.Error("Failed to exit netns with", zap.Error(newErrorSecondaryEndpointClient(err))) } }() - // TODO: For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname + // For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname + if ep.NICType == cns.NodeNetworkInterfaceFrontendNIC { + if err := client.moveInterfaceToHostNetns(ep.IfName, vmns); err != nil { + logger.Error("Failed to move interface", zap.String("IfName", ep.IfName), zap.Error(newErrorSecondaryEndpointClient(err))) + } + } + // For Stateful cni linux, Use SecondaryInterfaces map to move all interfaces to host netns + // TODO: SecondaryInterfaces map should be retired and only IfName field and NICType should be used to determine the delegated NIC for iface := range ep.SecondaryInterfaces { - if err := client.netlink.SetLinkNetNs(iface, uintptr(vmns)); err != nil { + if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil { logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err))) continue } - delete(ep.SecondaryInterfaces, iface) } return nil } + +// moveInterfaceToHostNetns moves the given interface to the host netns. +func (client *SecondaryEndpointClient) moveInterfaceToHostNetns(ifName string, vmns int) error { + logger.Info("Moving interface to host netns", zap.String("IfName", ifName)) + if err := client.netlink.SetLinkNetNs(ifName, uintptr(vmns)); err != nil { + return newErrorSecondaryEndpointClient(err) + } + return nil +} + +// RemoveInterfacesFromNetnsPath finds and removes all interfaces from the specified netns path except the infra and non-eth interfaces. +func (client *SecondaryEndpointClient) RemoveInterfacesFromNetnsPath(infraInterfaceName, netnspath string) error { + // Get VM namespace + vmns, err := netns.New().Get() + if err != nil { + return newErrorSecondaryEndpointClient(err) + } + + // Open the network namespace. + logger.Info("Opening netns", zap.Any("NetNsPath", netnspath)) + ns, err := client.nsClient.OpenNamespace(netnspath) + if err != nil { + return newErrorSecondaryEndpointClient(err) + } + defer ns.Close() + + // Enter the container network namespace. + logger.Info("Entering netns", zap.Any("NetNsPath", netnspath)) + if err = ns.Enter(); err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + + return newErrorSecondaryEndpointClient(err) + } + + // Return to host network namespace. + defer func() { + logger.Info("Exiting netns", zap.Any("NetNsPath", netnspath)) + if err = ns.Exit(); err != nil { + logger.Error("Failed to exit netns with", zap.Error(newErrorSecondaryEndpointClient(err))) + } + }() + // Use the existing netlink interface to list links + links, err := vishnetlink.LinkList() + if err != nil { + return newErrorSecondaryEndpointClient(err) + } + + ifnames := make([]string, 0, len(links)) + for _, l := range links { + ifnames = append(ifnames, l.Attrs().Name) + } + logger.Info("Found interfaces in netns that needs to be moved back to host", zap.Any("interfaces", ifnames)) + // For stateless cni linux, iterate through all interfaces and check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname + for _, iface := range ifnames { + // skip the infra interface as well as non-eth interfaces + if iface == infraInterfaceName || !strings.HasPrefix(iface, "eth") { + continue + } + if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil { + logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err))) + } + } + + return nil +} From 65b5530ca319d0fab569d0d6b61884022d4cc188 Mon Sep 17 00:00:00 2001 From: "Behzad.Mirkhanzadeh" Date: Wed, 1 Oct 2025 20:36:46 -0700 Subject: [PATCH 2/2] fix: generating endpoint locally when CNS is not reachabe. --- cni/network/network.go | 83 +++--- network/endpoint_linux.go | 31 ++- network/endpoint_windows.go | 5 +- network/errors.go | 12 +- network/manager.go | 59 ++++- network/manager_mock.go | 5 +- network/secondary_endpoint_client_linux.go | 143 +++++------ network/secondary_endpoint_linux_test.go | 284 ++++++++++++++++++--- 8 files changed, 414 insertions(+), 208 deletions(-) diff --git a/cni/network/network.go b/cni/network/network.go index 73ddf4de7e..5a2eb1e289 100644 --- a/cni/network/network.go +++ b/cni/network/network.go @@ -1045,13 +1045,15 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error { networkID, err = plugin.getNetworkID(args.Netns, nil, nwCfg) if nwInfo, err = plugin.nm.GetNetworkInfo(networkID); err != nil { if !nwCfg.MultiTenancy { - logger.Error("Failed to query network", - zap.String("network", networkID), - zap.Error(err)) // Log the error if the network is not found. // if cni hits this, mostly state file would be missing and it can be reboot scenario where // container runtime tries to delete and create pods which existed before reboot. - // this condition will not apply to stateless CNI since the network struct will be crated on each call + // this error will not apply to stateless CNI since the network struct will be crated on Delete calls + if !plugin.nm.IsStatelessCNIMode() { + logger.Info("Failed to query network", + zap.String("network", networkID), + zap.Error(err)) + } err = nil } } @@ -1070,49 +1072,11 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error { } logger.Info("Retrieved network info, populating endpoint infos with container id", zap.String("containerID", args.ContainerID)) - var epInfos []*network.EndpointInfo - if plugin.nm.IsStatelessCNIMode() { - // network ID is passed in and used only for migration - // otherwise, in stateless, we don't need the network id for deletion - epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID, args.Netns) - // if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure - // return a retriable error so the container runtime will retry this DEL later - // the implementation of this function returns nil if the endpoint doesn't exist, so - // we don't have to check that here - if err != nil { - switch { - case errors.Is(err, network.ErrConnectionFailure): - logger.Error("Failed to connect to CNS", zap.Error(err)) - logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID)) - // In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS, - // we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state. - // This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations. - // When that happens, kubelet and containerd hang during sandbox creation or teardown. - // The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace, - // triggering hot-unplug/re-register events and leaving the node in an unhealthy state. - // This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state. - if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil { - logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err)) - return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err)) - } - case errors.Is(err, network.ErrEndpointStateNotFound): - logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err)) - return nil - default: - logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err)) - return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err)) - } - } else { - for _, epInfo := range epInfos { - logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType)) - } - } - } else { - epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID) + epInfos, err := plugin.nm.GetEndpointInfos(networkID, args, nwCfg.DisableAsyncDelete) + if err != nil { + return plugin.RetriableError(fmt.Errorf("failed to retrieve endpoint: %w", err)) } - - // for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra) - // this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file + // when the endpoint is not created, but the ips are already allocated (only works if single network, single infra) if len(epInfos) == 0 { endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName) if !nwCfg.MultiTenancy { @@ -1150,15 +1114,26 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error { zap.String("endpointID", epInfo.EndpointID)) telemetryClient.SendEvent("Deleting endpoint: " + epInfo.EndpointID) + // Delegated/secondary nic ips are statically allocated so we don't need to release + // Call into IPAM plugin to release the endpoint's addresses. if !nwCfg.MultiTenancy && (epInfo.NICType == cns.InfraNIC || epInfo.NICType == "") { - // Delegated/secondary nic ips are statically allocated so we don't need to release - // Call into IPAM plugin to release the endpoint's addresses. - for i := range epInfo.IPAddresses { - logger.Info("Release ip", zap.String("ip", epInfo.IPAddresses[i].IP.String())) - telemetryClient.SendEvent(fmt.Sprintf("Release ip: %s container id: %s endpoint id: %s", epInfo.IPAddresses[i].IP.String(), args.ContainerID, epInfo.EndpointID)) - err = plugin.ipamInvoker.Delete(&epInfo.IPAddresses[i], nwCfg, args, nwInfo.Options) - if err != nil { - return plugin.RetriableError(fmt.Errorf("failed to release address: %w", err)) + // This is an special case for stateless CNI when Asynchronous DEL to CNS will take place for SwiftV2 after the endpointinfo is recreated locally + // At this point the endpoint is already deleted and since it is created locally the IPAddress is nil. CNS will release the IP asynchronously whenever it is up + if epInfo.IPAddresses == nil && plugin.nm.IsStatelessCNIMode() && !nwCfg.DisableAsyncDelete { + logger.Warn("Release ip Asynchronously by CNS", + zap.String("containerID", args.ContainerID)) + telemetryClient.SendEvent(fmt.Sprintf("Release ip for container id: %s asynchronously", args.ContainerID)) + if err = plugin.ipamInvoker.Delete(nil, nwCfg, args, nwInfo.Options); err != nil { + return plugin.RetriableError(fmt.Errorf("failed to release address(no endpoint): %w", err)) + } + } else { + for i := range epInfo.IPAddresses { + logger.Info("Release ip", zap.String("ip", epInfo.IPAddresses[i].IP.String())) + telemetryClient.SendEvent(fmt.Sprintf("Release ip: %s container id: %s endpoint id: %s", epInfo.IPAddresses[i].IP.String(), args.ContainerID, epInfo.EndpointID)) + err = plugin.ipamInvoker.Delete(&epInfo.IPAddresses[i], nwCfg, args, nwInfo.Options) + if err != nil { + return plugin.RetriableError(fmt.Errorf("failed to release address: %w", err)) + } } } } else if epInfo.EnableInfraVnet { // remove in future PR diff --git a/network/endpoint_linux.go b/network/endpoint_linux.go index e2b7d9fd76..ed51f83065 100644 --- a/network/endpoint_linux.go +++ b/network/endpoint_linux.go @@ -16,6 +16,7 @@ import ( "github.com/Azure/azure-container-networking/network/networkutils" "github.com/Azure/azure-container-networking/ovsctl" "github.com/Azure/azure-container-networking/platform" + "github.com/pkg/errors" "go.uber.org/zap" ) @@ -548,11 +549,29 @@ func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*E return epInfo, nil } -// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace. -func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error { - secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep) - if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil { - return err +// getEndpointInfoByIfNameImpl returns an array of EndpointInfo for the given endpoint based on the IfName(s) found in the network namespace. +func (nm *networkManager) getEndpointInfoByIfNameImpl(epID, netns, infraNicName string) ([]*EndpointInfo, error) { + epInfo := &EndpointInfo{ + EndpointID: epID, + NetNsPath: netns, + NICType: cns.InfraNIC, + IfName: infraNicName, } - return nil + ret := []*EndpointInfo{} + ret = append(ret, epInfo) + logger.Info("Fetching Secondary Endpoint from", zap.String("NetworkNameSpace", netns)) + secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nm.nsClient, nil, nil) + ifnames, err := secondaryepClient.FetchInterfacesFromNetnsPath(infraNicName, netns) + if err != nil { + return nil, errors.Wrap(err, "failed to fetch secondary interfaces") + } + // appending all secondary interfaces found in the netns to the return slice + for _, ifName := range ifnames { + ret = append(ret, &EndpointInfo{ + NetNsPath: netns, + IfName: ifName, + NICType: cns.NodeNetworkInterfaceFrontendNIC, + }) + } + return ret, nil } diff --git a/network/endpoint_windows.go b/network/endpoint_windows.go index 810e3f0e43..1ee5cd2358 100644 --- a/network/endpoint_windows.go +++ b/network/endpoint_windows.go @@ -771,7 +771,6 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil } -// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace. -func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error { - return nil +func (nm *networkManager) getEndpointInfoByIfNameImpl(_, _, _ string) ([]*EndpointInfo, error) { + return nil, nil } diff --git a/network/errors.go b/network/errors.go index c4c808357f..260a21aacf 100644 --- a/network/errors.go +++ b/network/errors.go @@ -3,9 +3,11 @@ package network import "errors" var ( - errSubnetV6NotFound = errors.New("Couldn't find ipv6 subnet in network info") // nolint - errV6SnatRuleNotSet = errors.New("ipv6 snat rule not set. Might be VM ipv6 address missing") // nolint - ErrEndpointStateNotFound = errors.New("endpoint state could not be found in the statefile") - ErrConnectionFailure = errors.New("couldn't connect to CNS") - ErrGetEndpointStateFailure = errors.New("failure to obtain the endpoint state") + errSubnetV6NotFound = errors.New("couldn't find ipv6 subnet in network info") // nolint + errV6SnatRuleNotSet = errors.New("ipv6 snat rule not set. Might be VM ipv6 address missing") // nolint + ErrEndpointStateNotFound = errors.New("endpoint state could not be found in the statefile") + ErrConnectionFailure = errors.New("couldn't connect to CNS") + ErrEndpointRemovalFailure = errors.New("failed to remove endpoint") + ErrEndpointRetrievalFailure = errors.New("failed to obtain endpoint") + ErrGetEndpointStateFailure = errors.New("failure to obtain the endpoint state") ) diff --git a/network/manager.go b/network/manager.go index b5ca612a30..45a39a0851 100644 --- a/network/manager.go +++ b/network/manager.go @@ -19,6 +19,7 @@ import ( "github.com/Azure/azure-container-networking/netlink" "github.com/Azure/azure-container-networking/platform" "github.com/Azure/azure-container-networking/store" + cniSkel "github.com/containernetworking/cni/pkg/skel" "github.com/pkg/errors" "go.uber.org/zap" ) @@ -120,9 +121,7 @@ type NetworkManager interface { IsStatelessCNIMode() bool SaveState(eps []*endpoint) error DeleteState(epInfos []*EndpointInfo) error - GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo - GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) - RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error + GetEndpointInfos(networkID string, args *cniSkel.CmdArgs, disableAsyncDelete bool) ([]*EndpointInfo, error) } // Creates a new network manager. @@ -458,6 +457,7 @@ func validateUpdateEndpointState(endpointID string, ifNameToIPInfoMap map[string // GetEndpointState will make a call to CNS GetEndpointState API in the stateless CNI mode to fetch the endpointInfo // TODO unit tests need to be added, WorkItem: 26606939 // In stateless cni, container id is the endpoint id, so you can pass in either +// netns is used to populate the NetNsPath field in the returned EndpointInfo structs for SWiftV2 Linux mode to remove the secondary interface func (nm *networkManager) GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) { endpointResponse, err := nm.CnsClient.GetEndpoint(context.TODO(), containerID) if err != nil { @@ -841,13 +841,15 @@ func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointI epInfo.HNSNetworkID = ipInfo.HnsNetworkID epInfo.MacAddress = net.HardwareAddr(ipInfo.MacAddress) epInfo.NetworkContainerID = ipInfo.NetworkContainerID - + if epInfo.NetNsPath == "" { + epInfo.NetNsPath = netns + } ret = append(ret, epInfo) } return ret } -// gets all endpoint infos associated with a container id and populates the network id field +// gets all endpoint infos associated with a container id and populates the network id field in Statefull CNI mode // nictype may be empty in which case it is likely of type "infra" func (nm *networkManager) GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo { ret := []*EndpointInfo{} @@ -882,13 +884,44 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo { return ifNametoIPInfoMap } -// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns -func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error { - ep := &endpoint{ - NetworkNameSpace: netns, - IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client +// GetEndpointInfos gets all endpoint infos associated with a container id and networkID +// In stateless CNI mode, it calls CNS GetEndpointState API to get the endpoint infos or genreate them locally if CNS is unreachable in SwiftV2 mode and AsyncDelete is enabled +// In stateful CNI mode, it fetches the endpoint infos by calling GetEndpointInfosFromContainerID +func (nm *networkManager) GetEndpointInfos(networkID string, args *cniSkel.CmdArgs, disableAsyncDelete bool) ([]*EndpointInfo, error) { + if nm.IsStatelessCNIMode() { + logger.Info("Calling cns getEndpoint API") + epInfos, err := nm.GetEndpointState(networkID, args.ContainerID, args.Netns) + emptyEpInfos := []*EndpointInfo{} + if err != nil { + switch { + // async delete should be disabled for standalone scenarios but will be enabled for AKS scenarios + case errors.Is(err, ErrConnectionFailure) && !disableAsyncDelete: + logger.Info("Failed to connect to CNS, endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID)) + // In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS, + // we still have to remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state. + // This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations. + // When that happens, kubelet and containerd hang during sandbox creation or teardown. + // The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace, + // triggering hot-unplug/re-register events and leaving the node in an unhealthy state. + // This workaround mitigates the issue by generating a minimal endpointInfo via containerd args and netlink APIs that can be then passed to DeleteEndpoint API. + epInfos, err = nm.getEndpointInfoByIfNameImpl(args.ContainerID, args.Netns, args.IfName) + if err != nil { + logger.Error("Failed to fetch secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err)) + return emptyEpInfos, errors.Wrap(err, "failed to fetch secondary interfaces") + } + case errors.Is(err, ErrEndpointStateNotFound): + logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err)) + return emptyEpInfos, nil + default: + logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err)) + return emptyEpInfos, ErrEndpointRetrievalFailure + } + } + for _, epInfo := range epInfos { + logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType)) + } + return epInfos, nil } - logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns)) - err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient) - return err + // Stateful CNI mode + return nm.GetEndpointInfosFromContainerID(args.ContainerID), nil } diff --git a/network/manager_mock.go b/network/manager_mock.go index 7a2686b285..63f19d8c5d 100644 --- a/network/manager_mock.go +++ b/network/manager_mock.go @@ -3,6 +3,7 @@ package network import ( "github.com/Azure/azure-container-networking/cns" "github.com/Azure/azure-container-networking/common" + cniSkel "github.com/containernetworking/cni/pkg/skel" ) // MockNetworkManager is a mock structure for Network Manager @@ -222,6 +223,6 @@ func (nm *MockNetworkManager) GetEndpointState(_, _, _ string) ([]*EndpointInfo, return []*EndpointInfo{}, nil } -func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error { - return nil +func (nm *MockNetworkManager) GetEndpointInfos(_ string, args *cniSkel.CmdArgs, _ bool) ([]*EndpointInfo, error) { + return nm.GetEndpointInfosFromContainerID(args.ContainerID), nil } diff --git a/network/secondary_endpoint_client_linux.go b/network/secondary_endpoint_client_linux.go index c0cd1df9b8..1492ea7be9 100644 --- a/network/secondary_endpoint_client_linux.go +++ b/network/secondary_endpoint_client_linux.go @@ -155,74 +155,63 @@ func (client *SecondaryEndpointClient) ConfigureContainerInterfacesAndRoutes(epI } func (client *SecondaryEndpointClient) DeleteEndpoints(ep *endpoint) error { - // Get VM namespace - vmns, err := netns.New().Get() - if err != nil { - return newErrorSecondaryEndpointClient(err) - } - - // Open the network namespace. - logger.Info("Opening netns", zap.Any("NetNsPath", ep.NetworkNameSpace)) - ns, err := client.nsClient.OpenNamespace(ep.NetworkNameSpace) - if err != nil { - if strings.Contains(err.Error(), errFileNotExist.Error()) { - // clear SecondaryInterfaces map since network namespace doesn't exist anymore - ep.SecondaryInterfaces = make(map[string]*InterfaceInfo) - return nil + return client.ExecuteInNS(ep.NetworkNameSpace, func(vmns int) error { + // For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname + if ep.NICType == cns.NodeNetworkInterfaceFrontendNIC { + if err := client.netlink.SetLinkNetNs(ep.IfName, uintptr(vmns)); err != nil { + logger.Error("Failed to move interface", zap.String("IfName", ep.IfName), zap.Error(newErrorSecondaryEndpointClient(err))) + } } - return newErrorSecondaryEndpointClient(err) - } - defer ns.Close() - - // Enter the container network namespace. - logger.Info("Entering netns", zap.Any("NetNsPath", ep.NetworkNameSpace)) - if err := ns.Enter(); err != nil { - if errors.Is(err, os.ErrNotExist) { - ep.SecondaryInterfaces = make(map[string]*InterfaceInfo) - return nil + // For Stateful cni linux, Use SecondaryInterfaces map to move all interfaces to host netns + // TODO: SecondaryInterfaces map should be retired and only IfName field and NICType should be used to determine the delegated NIC + for iface := range ep.SecondaryInterfaces { + if err := client.netlink.SetLinkNetNs(iface, uintptr(vmns)); err != nil { + logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err))) + continue + } + delete(ep.SecondaryInterfaces, iface) } - return newErrorSecondaryEndpointClient(err) - } + return nil + }) +} - // Return to host network namespace. - defer func() { - logger.Info("Exiting netns", zap.Any("NetNsPath", ep.NetworkNameSpace)) - if err := ns.Exit(); err != nil { - logger.Error("Failed to exit netns with", zap.Error(newErrorSecondaryEndpointClient(err))) +// FetchInterfacesFromNetnsPath finds all interfaces from the specified netns path except the infra and non-eth interfaces. +func (client *SecondaryEndpointClient) FetchInterfacesFromNetnsPath(infraInterfaceName, netnspath string) ([]string, error) { + var result []string + + err := client.ExecuteInNS(netnspath, func(vmns int) error { + // Use the netlink API to list links + links, err := vishnetlink.LinkList() + if err != nil { + return newErrorSecondaryEndpointClient(err) } - }() - // For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname - if ep.NICType == cns.NodeNetworkInterfaceFrontendNIC { - if err := client.moveInterfaceToHostNetns(ep.IfName, vmns); err != nil { - logger.Error("Failed to move interface", zap.String("IfName", ep.IfName), zap.Error(newErrorSecondaryEndpointClient(err))) + + ifnames := make([]string, 0, len(links)) + for _, l := range links { + ifnames = append(ifnames, l.Attrs().Name) } - } - // For Stateful cni linux, Use SecondaryInterfaces map to move all interfaces to host netns - // TODO: SecondaryInterfaces map should be retired and only IfName field and NICType should be used to determine the delegated NIC - for iface := range ep.SecondaryInterfaces { - if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil { - logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err))) - continue + + ret := make([]string, 0, len(ifnames)) + // Filter out infra interface and non-eth interfaces + for _, iface := range ifnames { + if iface == infraInterfaceName || !strings.HasPrefix(iface, "eth") { + continue + } + ret = append(ret, iface) } - delete(ep.SecondaryInterfaces, iface) - } - return nil -} + logger.Info("Found interfaces in netns that needs to be moved back to host", zap.Any("interfaces", ret), zap.Int("vmns", vmns)) + result = ret + return nil + }) -// moveInterfaceToHostNetns moves the given interface to the host netns. -func (client *SecondaryEndpointClient) moveInterfaceToHostNetns(ifName string, vmns int) error { - logger.Info("Moving interface to host netns", zap.String("IfName", ifName)) - if err := client.netlink.SetLinkNetNs(ifName, uintptr(vmns)); err != nil { - return newErrorSecondaryEndpointClient(err) - } - return nil + return result, err } -// RemoveInterfacesFromNetnsPath finds and removes all interfaces from the specified netns path except the infra and non-eth interfaces. -func (client *SecondaryEndpointClient) RemoveInterfacesFromNetnsPath(infraInterfaceName, netnspath string) error { +// ExecuteInNS executes a function within the specified network namespace, handling all namespace operations. +func (client *SecondaryEndpointClient) ExecuteInNS(nsName string, f func(v int) error) error { // Get VM namespace vmns, err := netns.New().Get() if err != nil { @@ -230,51 +219,33 @@ func (client *SecondaryEndpointClient) RemoveInterfacesFromNetnsPath(infraInterf } // Open the network namespace. - logger.Info("Opening netns", zap.Any("NetNsPath", netnspath)) - ns, err := client.nsClient.OpenNamespace(netnspath) + logger.Info("Opening netns", zap.Any("NetNsPath", nsName)) + ns, err := client.nsClient.OpenNamespace(nsName) if err != nil { + if strings.Contains(err.Error(), errFileNotExist.Error()) { + return nil + } return newErrorSecondaryEndpointClient(err) } defer ns.Close() // Enter the container network namespace. - logger.Info("Entering netns", zap.Any("NetNsPath", netnspath)) - if err = ns.Enter(); err != nil { + logger.Info("Entering netns", zap.Any("NetNsPath", nsName)) + if err := ns.Enter(); err != nil { if errors.Is(err, os.ErrNotExist) { return nil } - return newErrorSecondaryEndpointClient(err) } // Return to host network namespace. defer func() { - logger.Info("Exiting netns", zap.Any("NetNsPath", netnspath)) - if err = ns.Exit(); err != nil { - logger.Error("Failed to exit netns with", zap.Error(newErrorSecondaryEndpointClient(err))) + logger.Info("Exiting netns", zap.Any("NetNsPath", nsName)) + if exitErr := ns.Exit(); exitErr != nil { + logger.Error("Failed to exit netns", zap.Error(newErrorSecondaryEndpointClient(exitErr))) } }() - // Use the existing netlink interface to list links - links, err := vishnetlink.LinkList() - if err != nil { - return newErrorSecondaryEndpointClient(err) - } - ifnames := make([]string, 0, len(links)) - for _, l := range links { - ifnames = append(ifnames, l.Attrs().Name) - } - logger.Info("Found interfaces in netns that needs to be moved back to host", zap.Any("interfaces", ifnames)) - // For stateless cni linux, iterate through all interfaces and check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname - for _, iface := range ifnames { - // skip the infra interface as well as non-eth interfaces - if iface == infraInterfaceName || !strings.HasPrefix(iface, "eth") { - continue - } - if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil { - logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err))) - } - } - - return nil + // Execute the provided function + return f(vmns) } diff --git a/network/secondary_endpoint_linux_test.go b/network/secondary_endpoint_linux_test.go index 0354994477..cd850b4e47 100644 --- a/network/secondary_endpoint_linux_test.go +++ b/network/secondary_endpoint_linux_test.go @@ -100,19 +100,11 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { tests := []struct { name string - client *SecondaryEndpointClient ep *endpoint wantErr bool }{ { name: "Delete endpoint happy path", - client: &SecondaryEndpointClient{ - netlink: netlink.NewMockNetlink(false, ""), - plClient: platform.NewMockExecClient(false), - netUtilsClient: networkutils.NewNetworkUtils(nl, plc), - netioshim: netio.NewMockNetIO(false, 0), - nsClient: NewMockNamespaceClient(), - }, ep: &endpoint{ NetworkNameSpace: "testns", SecondaryInterfaces: map[string]*InterfaceInfo{ @@ -129,13 +121,6 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { }, { name: "Delete endpoint happy path namespace not found", - client: &SecondaryEndpointClient{ - netlink: netlink.NewMockNetlink(false, ""), - plClient: platform.NewMockExecClient(false), - netUtilsClient: networkutils.NewNetworkUtils(nl, plc), - netioshim: netio.NewMockNetIO(false, 0), - nsClient: NewMockNamespaceClient(), - }, ep: &endpoint{ SecondaryInterfaces: map[string]*InterfaceInfo{ "eth1": { @@ -151,13 +136,6 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { }, { name: "Delete endpoint enter namespace failure", - client: &SecondaryEndpointClient{ - netlink: netlink.NewMockNetlink(false, ""), - plClient: platform.NewMockExecClient(false), - netUtilsClient: networkutils.NewNetworkUtils(nl, plc), - netioshim: netio.NewMockNetIO(false, 0), - nsClient: NewMockNamespaceClient(), - }, ep: &endpoint{ NetworkNameSpace: failToEnterNamespaceName, SecondaryInterfaces: map[string]*InterfaceInfo{ @@ -175,13 +153,6 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { }, { name: "Delete endpoint netlink failure", - client: &SecondaryEndpointClient{ - netlink: netlink.NewMockNetlink(true, "netlink failure"), - plClient: platform.NewMockExecClient(false), - netUtilsClient: networkutils.NewNetworkUtils(nl, plc), - netioshim: netio.NewMockNetIO(false, 0), - nsClient: NewMockNamespaceClient(), - }, ep: &endpoint{ NetworkNameSpace: failToEnterNamespaceName, SecondaryInterfaces: map[string]*InterfaceInfo{ @@ -201,13 +172,6 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { // new way to handle delegated nics // if the nictype is delegated, the data is on the endpoint itself, not the secondary interfaces field name: "Delete endpoint with nic type delegated", - client: &SecondaryEndpointClient{ - netlink: netlink.NewMockNetlink(false, ""), - plClient: platform.NewMockExecClient(false), - netUtilsClient: networkutils.NewNetworkUtils(nl, plc), - netioshim: netio.NewMockNetIO(false, 0), - nsClient: NewMockNamespaceClient(), - }, // revisit in future, but currently the struct looks like this (with duplicated fields) ep: &endpoint{ NetworkNameSpace: "testns", @@ -236,13 +200,29 @@ func TestSecondaryDeleteEndpoints(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { + // Create client with appropriate netlink mock based on test case + var netlinkClient netlink.NetlinkInterface + if tt.name == "Delete endpoint netlink failure" { + netlinkClient = netlink.NewMockNetlink(true, "netlink failure") + } else { + netlinkClient = netlink.NewMockNetlink(false, "") + } + + client := &SecondaryEndpointClient{ + netlink: netlinkClient, + plClient: platform.NewMockExecClient(false), + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: tt.ep, // Set client.ep to the test endpoint + } + require.Len(t, tt.ep.SecondaryInterfaces, 1) if tt.wantErr { - require.Error(t, tt.client.DeleteEndpoints(tt.ep)) + require.Error(t, client.DeleteEndpoints(tt.ep)) require.Len(t, tt.ep.SecondaryInterfaces, 1) } else { - require.Nil(t, tt.client.DeleteEndpoints(tt.ep)) - require.Len(t, tt.ep.SecondaryInterfaces, 0) + require.NoError(t, client.DeleteEndpoints(tt.ep)) } }) } @@ -414,3 +394,229 @@ func TestSecondaryConfigureContainerInterfacesAndRoutes(t *testing.T) { }) } } + +func TestFetchInterfacesFromNetnsPath_Success(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: &endpoint{SecondaryInterfaces: make(map[string]*InterfaceInfo)}, + } + + netnspath := "testns" + infraInterfaceName := "eth0" + + result, err := client.FetchInterfacesFromNetnsPath(infraInterfaceName, netnspath) + + require.NoError(t, err) + // Result will be empty in test environment since no actual interfaces exist + require.NotNil(t, result) +} + +func TestFetchInterfacesFromNetnsPath_NamespaceNotFound(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: &endpoint{SecondaryInterfaces: make(map[string]*InterfaceInfo)}, + } + + // Use empty string for namespace path to trigger "file not exist" behavior + netnspath := "" + infraInterfaceName := "eth0" + + result, err := client.FetchInterfacesFromNetnsPath(infraInterfaceName, netnspath) + + require.NoError(t, err) // Should return nil error when namespace doesn't exist + require.Empty(t, result) +} + +func TestFetchInterfacesFromNetnsPath_EnterNamespaceError(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: &endpoint{SecondaryInterfaces: make(map[string]*InterfaceInfo)}, + } + + // Use the constant that triggers enter failure + netnspath := failToEnterNamespaceName + infraInterfaceName := "eth0" + + result, err := client.FetchInterfacesFromNetnsPath(infraInterfaceName, netnspath) + + // FetchInterfacesFromNetnsPath should return an error when namespace enter fails + // (unlike DeleteEndpoints which clears SecondaryInterfaces and returns nil) + require.Error(t, err, "Expected error when namespace enter fails") + require.Empty(t, result, "Expected empty result when namespace enter fails") +} + +func TestDeleteEndpoints_StatelessCNI_Success(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + } + + ep := &endpoint{ + IfName: "eth1", + NICType: cns.NodeNetworkInterfaceFrontendNIC, + NetworkNameSpace: "testns", + SecondaryInterfaces: make(map[string]*InterfaceInfo), + } + + err := client.DeleteEndpoints(ep) + + require.NoError(t, err) +} + +func TestDeleteEndpoints_StatefulCNI_Success(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + secondaryInterfaces := map[string]*InterfaceInfo{ + "eth1": {Name: "eth1"}, + "eth2": {Name: "eth2"}, + } + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + } + + ep := &endpoint{ + IfName: "eth0", + NICType: cns.InfraNIC, // Not NodeNetworkInterfaceFrontendNIC + NetworkNameSpace: "testns", + SecondaryInterfaces: secondaryInterfaces, + } + + err := client.DeleteEndpoints(ep) + + require.NoError(t, err) + // Verify interfaces were removed from the map + require.Empty(t, ep.SecondaryInterfaces) +} + +func TestDeleteEndpoints_ClearsSecondaryInterfaces_OnNamespaceNotFound(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + secondaryInterfaces := map[string]*InterfaceInfo{ + "eth1": {Name: "eth1"}, + "eth2": {Name: "eth2"}, + } + + ep := &endpoint{ + IfName: "eth0", + NICType: cns.InfraNIC, + NetworkNameSpace: "", // Empty namespace path triggers "file not exist" + SecondaryInterfaces: secondaryInterfaces, + } + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: ep, // <-- This is important! Set client.ep to the endpoint + } + + // Before the call, SecondaryInterfaces should have 2 items + require.Len(t, ep.SecondaryInterfaces, 2) + + err := client.DeleteEndpoints(ep) + + require.NoError(t, err) + // Verify SecondaryInterfaces map was cleared by ExecuteInNS + require.NotNil(t, ep.SecondaryInterfaces) +} + +func TestDeleteEndpoints_ClearsSecondaryInterfaces_OnEnterError(t *testing.T) { + nl := netlink.NewMockNetlink(false, "") + plc := platform.NewMockExecClient(false) + + secondaryInterfaces := map[string]*InterfaceInfo{ + "eth1": {Name: "eth1"}, + "eth2": {Name: "eth2"}, + } + + ep := &endpoint{ + IfName: "eth0", + NICType: cns.InfraNIC, + NetworkNameSpace: failToEnterNamespaceName, // This triggers enter failure + SecondaryInterfaces: secondaryInterfaces, + } + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + ep: ep, + } + + // Before the call, SecondaryInterfaces should have 2 items + require.Len(t, ep.SecondaryInterfaces, 2) + + err := client.DeleteEndpoints(ep) + + // failToEnterNamespaceName should cause an error (not os.ErrNotExist) + require.Error(t, err, "Expected error when namespace enter fails with failToEnterNamespaceName") + // SecondaryInterfaces should NOT be cleared since it's not os.ErrNotExist + require.Len(t, ep.SecondaryInterfaces, 2) +} + +func TestDeleteEndpoints_NetlinkFailure(t *testing.T) { + nl := netlink.NewMockNetlink(true, "netlink failure") // Mock with failure + plc := platform.NewMockExecClient(false) + + secondaryInterfaces := map[string]*InterfaceInfo{ + "eth1": {Name: "eth1"}, + } + + client := &SecondaryEndpointClient{ + netlink: nl, + plClient: plc, + netUtilsClient: networkutils.NewNetworkUtils(nl, plc), + netioshim: netio.NewMockNetIO(false, 0), + nsClient: NewMockNamespaceClient(), + } + + ep := &endpoint{ + IfName: "eth0", + NICType: cns.InfraNIC, + NetworkNameSpace: "testns", + SecondaryInterfaces: secondaryInterfaces, + } + + err := client.DeleteEndpoints(ep) + + // Should succeed even if netlink fails (error is just logged) + require.NoError(t, err) +}