Skip to content

Commit 3f3e4a5

Browse files
committed
Fix: adding support for SWiftV2 Linux stateless CNI mode
1 parent d66b2f9 commit 3f3e4a5

File tree

7 files changed

+268
-23
lines changed

7 files changed

+268
-23
lines changed

cni/network/network.go

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import (
1818
"github.com/Azure/azure-container-networking/cni/util"
1919
"github.com/Azure/azure-container-networking/cns"
2020
cnscli "github.com/Azure/azure-container-networking/cns/client"
21-
"github.com/Azure/azure-container-networking/cns/fsnotify"
2221
"github.com/Azure/azure-container-networking/common"
2322
"github.com/Azure/azure-container-networking/dhcp"
2423
"github.com/Azure/azure-container-networking/iptables"
@@ -716,7 +715,7 @@ func (plugin *NetPlugin) createEpInfo(opt *createEpInfoOpt) (*network.EndpointIn
716715
*opt.infraSeen = true
717716
} else {
718717
ifName = "eth" + strconv.Itoa(opt.endpointIndex)
719-
endpointID = plugin.nm.GetEndpointID(opt.args.ContainerID, ifName)
718+
endpointID = plugin.nm.GetEndpointIDByNicType(opt.args.ContainerID, ifName, opt.ifInfo.NICType)
720719
}
721720

722721
endpointInfo := network.EndpointInfo{
@@ -1070,31 +1069,45 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
10701069
// network ID is passed in and used only for migration
10711070
// otherwise, in stateless, we don't need the network id for deletion
10721071
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID)
1073-
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found
1072+
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure
1073+
// return a retriable error so the container runtime will retry this DEL later
1074+
// the implementation of this function returns nil if the endpoint doesn't exist, so
1075+
// we don't have to check that here
10741076
if err != nil {
1075-
if errors.Is(err, network.ErrConnectionFailure) {
1076-
logger.Info("failed to connect to CNS", zap.String("containerID", args.ContainerID), zap.Error(err))
1077-
addErr := fsnotify.AddFile(args.ContainerID, args.ContainerID, watcherPath)
1078-
logger.Info("add containerid file for Asynch delete", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1079-
if addErr != nil {
1080-
logger.Error("failed to add file to watcher", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1081-
return errors.Wrap(addErr, fmt.Sprintf("failed to add file to watcher with containerID %s", args.ContainerID))
1077+
switch {
1078+
case errors.Is(err, network.ErrConnectionFailure):
1079+
logger.Error("Failed to connect to CNS", zap.Error(err))
1080+
logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
1081+
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
1082+
// we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
1083+
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
1084+
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
1085+
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
1086+
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
1087+
// This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state.
1088+
if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil {
1089+
logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
1090+
return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err))
10821091
}
1083-
return nil
1084-
}
1085-
if errors.Is(err, network.ErrEndpointStateNotFound) {
1092+
case errors.Is(err, network.ErrEndpointStateNotFound):
10861093
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
10871094
return nil
1095+
default:
1096+
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1097+
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
1098+
}
1099+
} else {
1100+
for i, epInfo := range epInfos {
1101+
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
1102+
epInfos[i].NetNsPath = args.Netns // This is needed in SwiftV2 scenario to move DelegatedNIC to host namespace
10881103
}
1089-
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1090-
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
10911104
}
10921105
} else {
10931106
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
10941107
}
10951108

1096-
// for when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1097-
// this block is not applied to stateless CNI
1109+
// for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1110+
// this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file
10981111
if len(epInfos) == 0 {
10991112
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
11001113
if !nwCfg.MultiTenancy {
@@ -1120,7 +1133,7 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
11201133
if err = plugin.nm.DeleteEndpoint(epInfo.NetworkID, epInfo.EndpointID, epInfo); err != nil {
11211134
// An error will not be returned if the endpoint is not found
11221135
// return a retriable error so the container runtime will retry this DEL later
1123-
// the implementation of this function returns nil if the endpoint doens't exist, so
1136+
// the implementation of this function returns nil if the endpoint doesn't exist, so
11241137
// we don't have to check that here
11251138
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
11261139
}

network/endpoint_linux.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,3 +547,12 @@ func getDefaultGateway(routes []RouteInfo) net.IP {
547547
func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*EndpointInfo, error) {
548548
return epInfo, nil
549549
}
550+
551+
// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace.
552+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error {
553+
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep)
554+
if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil {
555+
return err
556+
}
557+
return nil
558+
}

network/endpoint_windows.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,8 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri
746746
logger.Info("Retrieved device problem code", zap.String("code", devpkeyDeviceProblemCode))
747747
return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil
748748
}
749+
750+
// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace.
751+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error {
752+
return nil
753+
}

network/manager.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,13 @@ type NetworkManager interface {
116116
UpdateEndpoint(networkID string, existingEpInfo *EndpointInfo, targetEpInfo *EndpointInfo) error
117117
GetNumberOfEndpoints(ifName string, networkID string) int
118118
GetEndpointID(containerID, ifName string) string
119+
GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string
119120
IsStatelessCNIMode() bool
120121
SaveState(eps []*endpoint) error
121122
DeleteState(epInfos []*EndpointInfo) error
122123
GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo
123124
GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error)
125+
RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error
124126
}
125127

126128
// Creates a new network manager.
@@ -514,7 +516,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
514516
nw := &network{
515517
Id: networkID, // currently unused in stateless cni
516518
HnsId: epInfo.HNSNetworkID,
517-
Mode: opModeTransparentVlan,
519+
Mode: opModeTransparent,
518520
SnatBridgeIP: "",
519521
NetNs: dummyGUID, // to trigger hns v2, windows
520522
extIf: &externalInterface{
@@ -529,6 +531,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
529531
HNSNetworkID: epInfo.HNSNetworkID, // unused (we use nw.HnsId for deleting the network)
530532
HostIfName: epInfo.HostIfName,
531533
LocalIP: "",
534+
IPAddresses: epInfo.IPAddresses,
532535
VlanID: 0,
533536
AllowInboundFromHostToNC: false, // stateless currently does not support apipa
534537
AllowInboundFromNCToHost: false,
@@ -537,11 +540,12 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
537540
NetworkContainerID: epInfo.NetworkContainerID, // we don't use this as long as AllowInboundFromHostToNC and AllowInboundFromNCToHost are false
538541
NetNs: dummyGUID, // to trigger hnsv2, windows
539542
NICType: epInfo.NICType,
543+
NetworkNameSpace: epInfo.NetNsPath,
540544
IfName: epInfo.IfName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
541545
}
542546
logger.Info("Deleting endpoint with", zap.String("Endpoint Info: ", epInfo.PrettyString()), zap.String("HNISID : ", ep.HnsId))
543547

544-
err := nw.deleteEndpointImpl(netlink.NewNetlink(), platform.NewExecClient(logger), nil, nil, nil, nil, nil, ep)
548+
err := nw.deleteEndpointImpl(nm.netlink, nm.plClient, nil, nm.netio, nm.nsClient, nm.iptablesClient, nm.dhcpClient, ep)
545549
if err != nil {
546550
return err
547551
}
@@ -745,6 +749,16 @@ func (nm *networkManager) GetEndpointID(containerID, ifName string) string {
745749
return containerID + "-" + ifName
746750
}
747751

752+
// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
753+
func (nm *networkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
754+
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
755+
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
756+
return containerID + "-" + ifName
757+
}
758+
// For InfraNIC, use GetEndpointID() logic.
759+
return nm.GetEndpointID(containerID, ifName)
760+
}
761+
748762
// saves the map of network ids to endpoints to the state file
749763
func (nm *networkManager) SaveState(eps []*endpoint) error {
750764
nm.Lock()
@@ -847,3 +861,14 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo {
847861

848862
return ifNametoIPInfoMap
849863
}
864+
865+
// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns
866+
func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error {
867+
ep := &endpoint{
868+
NetworkNameSpace: netns,
869+
IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
870+
}
871+
logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns))
872+
err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient)
873+
return err
874+
}

network/manager_mock.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package network
22

33
import (
4+
"github.com/Azure/azure-container-networking/cns"
45
"github.com/Azure/azure-container-networking/common"
56
)
67

@@ -94,6 +95,16 @@ func (nm *MockNetworkManager) GetEndpointID(containerID, ifName string) string {
9495
return containerID + "-" + ifName
9596
}
9697

98+
// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
99+
func (nm *MockNetworkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
100+
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
101+
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
102+
return containerID + "-" + ifName
103+
}
104+
// For InfraNIC, use GetEndpointID() logic.
105+
return nm.GetEndpointID(containerID, ifName)
106+
}
107+
97108
func (nm *MockNetworkManager) GetAllEndpoints(networkID string) (map[string]*EndpointInfo, error) {
98109
return nm.TestEndpointInfoMap, nil
99110
}
@@ -210,3 +221,7 @@ func (nm *MockNetworkManager) GetEndpointInfosFromContainerID(containerID string
210221
func (nm *MockNetworkManager) GetEndpointState(_, _ string) ([]*EndpointInfo, error) {
211222
return []*EndpointInfo{}, nil
212223
}
224+
225+
func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error {
226+
return nil
227+
}

network/manager_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,106 @@ var _ = Describe("Test Manager", func() {
489489
})
490490
})
491491
})
492+
493+
func TestGetEndpointIDByNicType_Cases(t *testing.T) {
494+
nm := &networkManager{}
495+
496+
cases := []struct {
497+
name string
498+
stateless bool
499+
containerID string
500+
ifName string
501+
nicType cns.NICType
502+
expectedResult string
503+
}{
504+
{
505+
name: "Stateless InfraNIC",
506+
stateless: true,
507+
containerID: "container123",
508+
ifName: "eth0",
509+
nicType: cns.InfraNIC,
510+
expectedResult: "container123",
511+
},
512+
{
513+
name: "Stateless SecondaryNIC",
514+
stateless: true,
515+
containerID: "container123",
516+
ifName: "eth1",
517+
nicType: cns.DelegatedVMNIC,
518+
expectedResult: "container123-eth1",
519+
},
520+
{
521+
name: "Stateful InfraNIC",
522+
stateless: false,
523+
containerID: "container123456789",
524+
ifName: "eth0",
525+
nicType: cns.InfraNIC,
526+
expectedResult: "containe-eth0", // truncated to 8 chars
527+
},
528+
}
529+
530+
for _, tc := range cases {
531+
t.Run(tc.name, func(t *testing.T) {
532+
nm.statelessCniMode = tc.stateless
533+
id := nm.GetEndpointIDByNicType(tc.containerID, tc.ifName, tc.nicType)
534+
if id != tc.expectedResult {
535+
t.Errorf("expected %s, got %s", tc.expectedResult, id)
536+
}
537+
})
538+
}
539+
}
540+
541+
func TestCnsEndpointInfotoCNIEpInfos_Cases(t *testing.T) {
542+
cases := []struct {
543+
name string
544+
ifName string
545+
ipInfo restserver.IPInfo
546+
expectedNetNs string
547+
expectedIfName string
548+
expectedNICType cns.NICType
549+
}{
550+
{
551+
name: "DelegatedVMNIC",
552+
ifName: "eth1",
553+
ipInfo: restserver.IPInfo{
554+
NICType: cns.DelegatedVMNIC,
555+
},
556+
expectedNetNs: "/var/run/netns/testns",
557+
expectedIfName: "eth1",
558+
expectedNICType: cns.DelegatedVMNIC,
559+
},
560+
{
561+
name: "InfraNIC",
562+
ifName: "eth0",
563+
ipInfo: restserver.IPInfo{
564+
NICType: cns.InfraNIC,
565+
},
566+
expectedNetNs: "",
567+
expectedIfName: "eth0",
568+
expectedNICType: cns.InfraNIC,
569+
},
570+
}
571+
572+
for _, tc := range cases {
573+
t.Run(tc.name, func(t *testing.T) {
574+
endpointInfo := restserver.EndpointInfo{
575+
IfnameToIPMap: map[string]*restserver.IPInfo{
576+
tc.ifName: &tc.ipInfo,
577+
},
578+
}
579+
epInfos := cnsEndpointInfotoCNIEpInfos(endpointInfo, "container123")
580+
if len(epInfos) == 0 {
581+
t.Fatalf("expected at least one epInfo")
582+
}
583+
if epInfos[0].NetNsPath != tc.expectedNetNs {
584+
t.Errorf("expected NetNsPath %q, got %q", tc.expectedNetNs, epInfos[0].NetNsPath)
585+
}
586+
if epInfos[0].IfName != tc.expectedIfName {
587+
t.Errorf("expected IfName %q, got %q", tc.expectedIfName, epInfos[0].IfName)
588+
}
589+
if epInfos[0].NICType != tc.expectedNICType {
590+
t.Errorf("expected NICType %v, got %v", tc.expectedNICType, epInfos[0].NICType)
591+
}
592+
})
593+
}
594+
}

0 commit comments

Comments
 (0)