Skip to content

Commit a9fccfa

Browse files
behzad-mirtamilmani1989rbtr
authored
fix: Handle async delete in stateless cni (#2967)
* feat: adding stateless CNI pipeline test * feat: making change for stateless CNI pipeline * feat: addressing the comments * fix: fixing stateles cni yaml * fix: stateless CNI delete fix * fix: addressing the comments * fix: addressing the comments and fix linter issues * Update cns/fsnotify/fsnotify.go Co-authored-by: tamilmani1989 <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * Update cni/network/network.go Co-authored-by: tamilmani1989 <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * Update cni/network/network.go Co-authored-by: tamilmani1989 <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * fix: addressing the comments * fix: fix the error code. * Fix: decoupling hnsclient form CNS watcher * fix: adding endpointmanager package to resolve platfrom specific call to HNS * Update cns/endpointmanager/endpointmanager_linux.go Co-authored-by: Evan Baker <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * Update cns/service/main.go Co-authored-by: Evan Baker <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * Fix: addressing the comments * fix: removing stateless CNI pipline changes form the PR * Update cns/configuration/configuration.go Co-authored-by: Evan Baker <[email protected]> Signed-off-by: Behzad Mirkhanzadeh <[email protected]> * addressing the comment --------- Signed-off-by: Behzad Mirkhanzadeh <[email protected]> Co-authored-by: tamilmani1989 <[email protected]> Co-authored-by: Evan Baker <[email protected]>
1 parent 4c0eb94 commit a9fccfa

File tree

13 files changed

+213
-18
lines changed

13 files changed

+213
-18
lines changed

cni/network/network.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/Azure/azure-container-networking/cni/util"
2121
"github.com/Azure/azure-container-networking/cns"
2222
cnscli "github.com/Azure/azure-container-networking/cns/client"
23+
"github.com/Azure/azure-container-networking/cns/fsnotify"
2324
"github.com/Azure/azure-container-networking/common"
2425
"github.com/Azure/azure-container-networking/dhcp"
2526
"github.com/Azure/azure-container-networking/iptables"
@@ -1131,18 +1132,38 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
11311132
// network ID is passed in and used only for migration
11321133
// otherwise, in stateless, we don't need the network id for deletion
11331134
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID)
1135+
// stateless CNI failed to get the endpoint state from CNS: handle connection failure, endpoint not found, and any other error separately
1136+
if err != nil {
1137+
if errors.Is(err, network.ErrConnectionFailure) {
1138+
logger.Info("failed to connect to CNS", zap.String("containerID", args.ContainerID), zap.Error(err))
1139+
addErr := fsnotify.AddFile(args.ContainerID, args.ContainerID, watcherPath)
1140+
logger.Info("add containerid file for Asynch delete", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1141+
if addErr != nil {
1142+
logger.Error("failed to add file to watcher", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1143+
return errors.Wrap(addErr, fmt.Sprintf("failed to add file to watcher with containerID %s", args.ContainerID))
1144+
}
1145+
return nil
1146+
}
1147+
if errors.Is(err, network.ErrEndpointStateNotFound) {
1148+
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
1149+
return nil
1150+
}
1151+
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1152+
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
1153+
}
11341154
} else {
11351155
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
11361156
}
11371157

11381158
// for when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1139-
// stateless cni won't have this issue
1159+
// this block is not applied to stateless CNI
11401160
if len(epInfos) == 0 {
11411161
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
11421162
if !nwCfg.MultiTenancy {
11431163
logger.Error("Failed to query endpoint",
11441164
zap.String("endpoint", endpointID),
11451165
zap.Error(err))
1166+
11461167
logger.Error("Release ip by ContainerID (endpoint not found)",
11471168
zap.String("containerID", args.ContainerID))
11481169
sendEvent(plugin, fmt.Sprintf("Release ip by ContainerID (endpoint not found):%v", args.ContainerID))

cns/client/client.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,30 +1028,33 @@ func (c *Client) GetEndpoint(ctx context.Context, endpointID string) (*restserve
10281028
// build the request
10291029
u := c.routes[cns.EndpointAPI]
10301030
uString := u.String() + endpointID
1031+
var response restserver.GetEndpointResponse
10311032
req, err := http.NewRequestWithContext(ctx, http.MethodGet, uString, http.NoBody)
10321033
if err != nil {
1033-
return nil, errors.Wrap(err, "failed to build request")
1034+
response.Response.ReturnCode = types.UnexpectedError
1035+
return &response, errors.Wrap(err, "failed to build request")
10341036
}
1037+
10351038
req.Header.Set(headerContentType, contentTypeJSON)
10361039
res, err := c.client.Do(req)
10371040
if err != nil {
1038-
return nil, errors.Wrap(err, "http request failed")
1041+
response.Response.ReturnCode = types.ConnectionError
1042+
return &response, &ConnectionFailureErr{cause: err}
10391043
}
10401044

10411045
defer res.Body.Close()
10421046

10431047
if res.StatusCode != http.StatusOK {
1044-
return nil, errors.Errorf("http response %d", res.StatusCode)
1048+
response.Response.ReturnCode = types.UnexpectedError
1049+
return &response, errors.Errorf("http response %d", res.StatusCode)
10451050
}
1046-
1047-
var response restserver.GetEndpointResponse
10481051
err = json.NewDecoder(res.Body).Decode(&response)
10491052
if err != nil {
1050-
return nil, errors.Wrap(err, "failed to decode GetEndpointResponse")
1053+
response.Response.ReturnCode = types.UnexpectedError
1054+
return &response, errors.Wrap(err, "failed to decode GetEndpointResponse")
10511055
}
1052-
10531056
if response.Response.ReturnCode != 0 {
1054-
return nil, errors.New(response.Response.Message)
1057+
return &response, errors.New(response.Response.Message)
10551058
}
10561059

10571060
return &response, nil
@@ -1076,7 +1079,7 @@ func (c *Client) UpdateEndpoint(ctx context.Context, endpointID string, ipInfo m
10761079
req.Header.Set(headerContentType, contentTypeJSON)
10771080
res, err := c.client.Do(req)
10781081
if err != nil {
1079-
return nil, errors.Wrap(err, "http request failed with error from server")
1082+
return nil, &ConnectionFailureErr{cause: err}
10801083
}
10811084

10821085
defer res.Body.Close()

cns/configuration/configuration.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/json"
66
"os"
77
"path/filepath"
8+
"runtime"
89
"strings"
910

1011
"github.com/Azure/azure-container-networking/cns"
@@ -231,3 +232,8 @@ func SetCNSConfigDefaults(config *CNSConfig) {
231232
config.GRPCSettings.Enable = false
232233
config.WatchPods = config.EnableIPAMv2 || config.EnableSwiftV2
233234
}
235+
236+
// isStalessCNIMode verify if the CNI is running stateless mode
237+
func (cnsconfig *CNSConfig) IsStalessCNIWindows() bool {
238+
return !cnsconfig.InitializeFromCNI && cnsconfig.ManageEndpointState && runtime.GOOS == "windows"
239+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package endpointmanager
2+
3+
import (
4+
"context"
5+
6+
"github.com/Azure/azure-container-networking/cns"
7+
"github.com/Azure/azure-container-networking/cns/restserver"
8+
)
9+
10+
type EndpointManager struct {
11+
cli releaseIPsClient // nolint
12+
}
13+
14+
type releaseIPsClient interface {
15+
ReleaseIPs(ctx context.Context, ipconfig cns.IPConfigsRequest) error
16+
GetEndpoint(ctx context.Context, endpointID string) (*restserver.GetEndpointResponse, error)
17+
}
18+
19+
func WithPlatformReleaseIPsManager(cli releaseIPsClient) *EndpointManager {
20+
return &EndpointManager{cli: cli}
21+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package endpointmanager
2+
3+
import (
4+
"context"
5+
6+
"github.com/Azure/azure-container-networking/cns"
7+
"github.com/pkg/errors"
8+
)
9+
10+
// ReleaseIPs implements an Interface in fsnotify for async delete of the HNS endpoint and IP addresses
11+
func (em *EndpointManager) ReleaseIPs(ctx context.Context, ipconfigreq cns.IPConfigsRequest) error {
12+
return errors.Wrap(em.cli.ReleaseIPs(ctx, ipconfigreq), "failed to release IP from CNS")
13+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package endpointmanager
2+
3+
import (
4+
"context"
5+
6+
"github.com/Azure/azure-container-networking/cns"
7+
"github.com/Azure/azure-container-networking/cns/hnsclient"
8+
"github.com/Azure/azure-container-networking/cns/logger"
9+
"github.com/pkg/errors"
10+
)
11+
12+
// ReleaseIPs implements an Interface in fsnotify for async delete of the HNS endpoint and IP addresses
13+
func (em *EndpointManager) ReleaseIPs(ctx context.Context, ipconfigreq cns.IPConfigsRequest) error {
14+
logger.Printf("deleting HNS Endpoint asynchronously")
15+
// remove HNS endpoint
16+
if err := em.deleteEndpoint(ctx, ipconfigreq.InfraContainerID); err != nil {
17+
logger.Errorf("failed to remove HNS endpoint %s", err.Error())
18+
}
19+
return errors.Wrap(em.cli.ReleaseIPs(ctx, ipconfigreq), "failed to release IP from CNS")
20+
}
21+
22+
// deleteEndpoint API to get the state and then remove assiciated HNS
23+
func (em *EndpointManager) deleteEndpoint(ctx context.Context, containerid string) error {
24+
endpointResponse, err := em.cli.GetEndpoint(ctx, containerid)
25+
if err != nil {
26+
return errors.Wrap(err, "failed to read the endpoint from CNS state")
27+
}
28+
for _, ipInfo := range endpointResponse.EndpointInfo.IfnameToIPMap {
29+
hnsEndpointID := ipInfo.HnsEndpointID
30+
// we need to get the HNSENdpoint via the IP address if the HNSEndpointID is not present in the statefile
31+
if ipInfo.HnsEndpointID == "" {
32+
if hnsEndpointID, err = hnsclient.GetHNSEndpointbyIP(ipInfo.IPv4, ipInfo.IPv6); err != nil {
33+
return errors.Wrap(err, "failed to find HNS endpoint with id")
34+
}
35+
}
36+
logger.Printf("deleting HNS Endpoint with id %v", hnsEndpointID)
37+
if err := hnsclient.DeleteHNSEndpointbyID(hnsEndpointID); err != nil {
38+
return errors.Wrap(err, "failed to delete HNS endpoint with id "+ipInfo.HnsEndpointID)
39+
}
40+
}
41+
return nil
42+
}

cns/fsnotify/fsnotify.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ import (
1515
"golang.org/x/sync/errgroup"
1616
)
1717

18-
type releaseIPsClient interface {
18+
type ReleaseIPsClient interface {
1919
ReleaseIPs(ctx context.Context, ipconfig cns.IPConfigsRequest) error
2020
}
2121

2222
type watcher struct {
23-
cli releaseIPsClient
23+
cli ReleaseIPsClient
2424
path string
2525
log *zap.Logger
2626

@@ -29,7 +29,7 @@ type watcher struct {
2929
}
3030

3131
// Create the AsyncDelete watcher.
32-
func New(cli releaseIPsClient, path string, logger *zap.Logger) (*watcher, error) { //nolint
32+
func New(cli ReleaseIPsClient, path string, logger *zap.Logger) (*watcher, error) { //nolint
3333
// Add directory where intended deletes are kept
3434
if err := os.Mkdir(path, 0o755); err != nil && !errors.Is(err, fs.ErrExist) { //nolint
3535
logger.Error("error making directory", zap.String("path", path), zap.Error(err))

cns/hnsclient/hnsclient_windows.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/Azure/azure-container-networking/network/policy"
1515
"github.com/Microsoft/hcsshim"
1616
"github.com/Microsoft/hcsshim/hcn"
17+
"github.com/pkg/errors"
1718
)
1819

1920
const (
@@ -75,6 +76,9 @@ const (
7576

7677
// signals a APIPA endpoint type
7778
apipaEndpointType = "APIPA"
79+
80+
// default network name used by HNS
81+
defaultNetworkName = "azure"
7882
)
7983

8084
// Named Lock for network and endpoint creation/deletion
@@ -685,3 +689,68 @@ func DeleteHostNCApipaEndpoint(
685689

686690
return nil
687691
}
692+
693+
// DeleteHNSEndpointbyID deletes the HNS endpoint
694+
func DeleteHNSEndpointbyID(hnsEndpointID string) error {
695+
var (
696+
hcnEndpoint *hcn.HostComputeEndpoint
697+
err error
698+
)
699+
700+
logger.Printf("Deleting hcn endpoint with id %v", hnsEndpointID)
701+
hcnEndpoint, err = hcn.GetEndpointByID(hnsEndpointID)
702+
if err != nil {
703+
// If error is anything other than EndpointNotFoundError, return error.
704+
// else log the error but don't return error because endpoint is already deleted.
705+
var notFoundErr hcn.EndpointNotFoundError
706+
if errors.As(err, &notFoundErr) {
707+
return fmt.Errorf("Failed to get hcn endpoint with id: %s due to err: %w", hnsEndpointID, err)
708+
}
709+
710+
logger.Errorf("Delete called on the Endpoint which doesn't exist. Error:%v", err)
711+
return nil
712+
}
713+
714+
// Remove this endpoint from the namespace
715+
if err = hcn.RemoveNamespaceEndpoint(hcnEndpoint.HostComputeNamespace, hcnEndpoint.Id); err != nil {
716+
logger.Errorf("Failed to remove hcn endpoint %s from namespace %s due to err: %v", hcnEndpoint.Id, hcnEndpoint.HostComputeNamespace, err)
717+
}
718+
719+
if err = hcnEndpoint.Delete(); err != nil {
720+
return fmt.Errorf("Failed to delete endpoint: %s. Error: %w", hnsEndpointID, err)
721+
}
722+
723+
logger.Errorf("[Azure CNS] Successfully deleted endpoint: %+v", hnsEndpointID)
724+
725+
return nil
726+
}
727+
728+
// GetHNSEndpointbyIP returns an HNSEndpoint with the corrsponding HNS Endpoint ID that matches an specific IP Address.
729+
func GetHNSEndpointbyIP(ipv4, ipv6 []net.IPNet) (string, error) {
730+
logger.Printf("Fetching missing HNS endpoint id for endpoints in network with id %s", defaultNetworkName)
731+
hnsResponse, err := hcn.GetNetworkByName(defaultNetworkName)
732+
if err != nil || hnsResponse == nil {
733+
return "", errors.Wrapf(err, "HNS Network or endpoints not found")
734+
}
735+
hcnEndpoints, err := hcn.ListEndpointsOfNetwork(hnsResponse.Id)
736+
if err != nil {
737+
return "", errors.Wrapf(err, "failed to fetch HNS endpoints for the given network")
738+
}
739+
for i := range hcnEndpoints {
740+
for _, ipConfiguration := range hcnEndpoints[i].IpConfigurations {
741+
for _, ip := range ipv4 {
742+
if ipConfiguration.IpAddress == ip.IP.String() {
743+
logger.Printf("Successfully found hcn endpoint id for endpoint %s with ip %s", hcnEndpoints[i].Id, ip.IP.String())
744+
return hcnEndpoints[i].Id, nil
745+
}
746+
}
747+
for _, ip := range ipv6 {
748+
if ipConfiguration.IpAddress == ip.IP.String() {
749+
logger.Printf("Successfully found hcn endpoint id for endpoint %s with ip %s", hcnEndpoints[i].Id, ip.IP.String())
750+
return hcnEndpoints[i].Id, nil
751+
}
752+
}
753+
}
754+
}
755+
return "", errors.Wrapf(err, "No HNSEndpointID matches the IPAddress")
756+
}

cns/restserver/ipam.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1173,7 +1173,7 @@ func (service *HTTPRestService) GetEndpointHelper(endpointID string) (*EndpointI
11731173
} else {
11741174
logger.Errorf("[GetEndpointState] Failed to retrieve state, err:%v", err)
11751175
}
1176-
return nil, errors.Wrap(err, "[GetEndpointState] Failed to retrieve state")
1176+
return nil, ErrEndpointStateNotFound
11771177
}
11781178
if endpointInfo, ok := service.EndpointState[endpointID]; ok {
11791179
logger.Warnf("[GetEndpointState] Found existing endpoint state for container %s", endpointID)

cns/service/main.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/Azure/azure-container-networking/cns/cnireconciler"
2727
"github.com/Azure/azure-container-networking/cns/common"
2828
"github.com/Azure/azure-container-networking/cns/configuration"
29+
"github.com/Azure/azure-container-networking/cns/endpointmanager"
2930
"github.com/Azure/azure-container-networking/cns/fsnotify"
3031
"github.com/Azure/azure-container-networking/cns/grpc"
3132
"github.com/Azure/azure-container-networking/cns/healthserver"
@@ -950,14 +951,22 @@ func main() {
950951

951952
if cnsconfig.EnableAsyncPodDelete {
952953
// Start fs watcher here
954+
z.Info("AsyncPodDelete is enabled")
955+
logger.Printf("AsyncPodDelete is enabled")
953956
cnsclient, err := cnsclient.New("", cnsReqTimeout) //nolint
954957
if err != nil {
955958
z.Error("failed to create cnsclient", zap.Error(err))
956959
}
957960
go func() {
958961
_ = retry.Do(func() error {
959962
z.Info("starting fsnotify watcher to process missed Pod deletes")
960-
w, err := fsnotify.New(cnsclient, cnsconfig.AsyncPodDeletePath, z)
963+
logger.Printf("starting fsnotify watcher to process missed Pod deletes")
964+
var endpointCleanup fsnotify.ReleaseIPsClient = cnsclient
965+
// use the endpointmanager implementation in the stateless CNI scenario to remove the HNS endpoint alongside the IPs
966+
if cnsconfig.IsStalessCNIWindows() {
967+
endpointCleanup = endpointmanager.WithPlatformReleaseIPsManager(cnsclient)
968+
}
969+
w, err := fsnotify.New(endpointCleanup, cnsconfig.AsyncPodDeletePath, z)
961970
if err != nil {
962971
z.Error("failed to create fsnotify watcher", zap.Error(err))
963972
return errors.Wrap(err, "failed to create fsnotify watcher, will retry")

0 commit comments

Comments
 (0)