fix: [NPM-WIN] lock the policyMap & updatePodCache (#1543)

huntergregory · vakalapa · web-flow · commit cfe483b95389 · 2022-09-02T13:44:03.000-07:00
* add policyMap lock

* add updatePodCache lock

* only lock policyMap in Windows and remove dead code

* defer unlocking and update comments

* debugging logs (remove later)

* more logs to remove

* revert test log commits

* aggregate errors while updating pods

* update log

* lock endpoint cache and remove pendingPolicies map

* refresh endpoints only before beginning all updatePod calls

* make policyMap lock for Linux too

Co-authored-by: Vamsi Kalapala <vakr@microsoft.com>
diff --git a/npm/pkg/dataplane/dataplane.go b/npm/pkg/dataplane/dataplane.go
@@ -2,6 +2,7 @@ package dataplane
 
 import (
 	"fmt"
+	"sync"
 	"time"
 
 	"github.com/Azure/azure-container-networking/common"
@@ -23,6 +24,24 @@ type Config struct {
 	*policies.PolicyManagerCfg
 }
 
+type updatePodCache struct {
+	sync.Mutex
+	cache map[string]*updateNPMPod
+}
+
+func newUpdatePodCache() *updatePodCache {
+	return &updatePodCache{cache: make(map[string]*updateNPMPod)}
+}
+
+type endpointCache struct {
+	sync.Mutex
+	cache map[string]*npmEndpoint
+}
+
+func newEndpointCache() *endpointCache {
+	return &endpointCache{cache: make(map[string]*npmEndpoint)}
+}
+
 type DataPlane struct {
 	*Config
 	policyMgr *policies.PolicyManager
@@ -31,13 +50,10 @@ type DataPlane struct {
 	nodeName  string
 	// endpointCache stores all endpoints of the network (including off-node)
 	// Key is PodIP
-	endpointCache  map[string]*npmEndpoint
+	endpointCache  *endpointCache
 	ioShim         *common.IOShim
-	updatePodCache map[string]*updateNPMPod
-	// pendingPolicies includes the policy keys of policies which may
-	// be referenced by ipsets but have not been applied to the kernel yet
-	pendingPolicies map[string]struct{}
-	stopChannel     <-chan struct{}
+	updatePodCache *updatePodCache
+	stopChannel    <-chan struct{}
 }
 
 func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChannel <-chan struct{}) (*DataPlane, error) {
@@ -47,15 +63,14 @@ func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChann
 		cfg.IPSetManagerCfg.AddEmptySetToLists = true
 	}
 	dp := &DataPlane{
-		Config:          cfg,
-		policyMgr:       policies.NewPolicyManager(ioShim, cfg.PolicyManagerCfg),
-		ipsetMgr:        ipsets.NewIPSetManager(cfg.IPSetManagerCfg, ioShim),
-		endpointCache:   make(map[string]*npmEndpoint),
-		nodeName:        nodeName,
-		ioShim:          ioShim,
-		updatePodCache:  make(map[string]*updateNPMPod),
-		pendingPolicies: make(map[string]struct{}),
-		stopChannel:     stopChannel,
+		Config:         cfg,
+		policyMgr:      policies.NewPolicyManager(ioShim, cfg.PolicyManagerCfg),
+		ipsetMgr:       ipsets.NewIPSetManager(cfg.IPSetManagerCfg, ioShim),
+		endpointCache:  newEndpointCache(),
+		nodeName:       nodeName,
+		ioShim:         ioShim,
+		updatePodCache: newUpdatePodCache(),
+		stopChannel:    stopChannel,
 	}
 
 	err := dp.BootupDataplane()
@@ -121,12 +136,19 @@ func (dp *DataPlane) AddToSets(setNames []*ipsets.IPSetMetadata, podMetadata *Po
 	}
 	if dp.shouldUpdatePod() {
 		klog.Infof("[DataPlane] Updating Sets to Add for pod key %s", podMetadata.PodKey)
-		if _, ok := dp.updatePodCache[podMetadata.PodKey]; !ok {
+
+		// lock updatePodCache while reading/modifying or setting the updatePod in the cache
+		dp.updatePodCache.Lock()
+		defer dp.updatePodCache.Unlock()
+
+		updatePod, ok := dp.updatePodCache.cache[podMetadata.PodKey]
+		if !ok {
 			klog.Infof("[DataPlane] {AddToSet} pod key %s not found in updatePodCache. creating a new obj", podMetadata.PodKey)
-			dp.updatePodCache[podMetadata.PodKey] = newUpdateNPMPod(podMetadata)
+			updatePod = newUpdateNPMPod(podMetadata)
+			dp.updatePodCache.cache[podMetadata.PodKey] = updatePod
 		}
 
-		dp.updatePodCache[podMetadata.PodKey].updateIPSetsToAdd(setNames)
+		updatePod.updateIPSetsToAdd(setNames)
 	}
 
 	return nil
@@ -142,12 +164,19 @@ func (dp *DataPlane) RemoveFromSets(setNames []*ipsets.IPSetMetadata, podMetadat
 
 	if dp.shouldUpdatePod() {
 		klog.Infof("[DataPlane] Updating Sets to Remove for pod key %s", podMetadata.PodKey)
-		if _, ok := dp.updatePodCache[podMetadata.PodKey]; !ok {
+
+		// lock updatePodCache while reading/modifying or setting the updatePod in the cache
+		dp.updatePodCache.Lock()
+		defer dp.updatePodCache.Unlock()
+
+		updatePod, ok := dp.updatePodCache.cache[podMetadata.PodKey]
+		if !ok {
 			klog.Infof("[DataPlane] {RemoveFromSet} pod key %s not found in updatePodCache. creating a new obj", podMetadata.PodKey)
-			dp.updatePodCache[podMetadata.PodKey] = newUpdateNPMPod(podMetadata)
+			updatePod = newUpdateNPMPod(podMetadata)
+			dp.updatePodCache.cache[podMetadata.PodKey] = updatePod
 		}
 
-		dp.updatePodCache[podMetadata.PodKey].updateIPSetsToRemove(setNames)
+		updatePod.updateIPSetsToRemove(setNames)
 	}
 
 	return nil
@@ -185,13 +214,33 @@ func (dp *DataPlane) ApplyDataPlane() error {
 	}
 
 	if dp.shouldUpdatePod() {
-		for podKey, pod := range dp.updatePodCache {
+		err := dp.refreshAllPodEndpoints()
+		if err != nil {
+			metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] failed to refresh endpoints while updating pods. err: [%s]", err.Error())
+			return fmt.Errorf("[DataPlane] failed to refresh endpoints while updating pods. err: [%w]", err)
+		}
+
+		// lock updatePodCache while driving goal state to kernel
+		// prevents another ApplyDataplane call from updating the same pods
+		dp.updatePodCache.Lock()
+		defer dp.updatePodCache.Unlock()
+
+		var aggregateErr error
+		for podKey, pod := range dp.updatePodCache.cache {
 			err := dp.updatePod(pod)
 			if err != nil {
-				metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "error: failed to update pods: %s", err.Error())
-				return fmt.Errorf("[DataPlane] error while updating pod: %w", err)
+				if aggregateErr == nil {
+					aggregateErr = fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%w]", podKey, err)
+				} else {
+					aggregateErr = fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%s]. previous err: [%w]", podKey, err.Error(), aggregateErr)
+				}
+				metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "failed to update pod while applying the dataplane. key: [%s], err: [%s]", podKey, err.Error())
+				continue
 			}
-			delete(dp.updatePodCache, podKey)
+			delete(dp.updatePodCache.cache, podKey)
+		}
+		if aggregateErr != nil { // at least one pod failed; failed pods stay in the cache for the next apply
+			return fmt.Errorf("[DataPlane] error while updating pods: %w", aggregateErr) // err (from the endpoint refresh) is always nil here
 		}
 	}
 	return nil
@@ -201,8 +250,6 @@ func (dp *DataPlane) ApplyDataPlane() error {
 func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
 	klog.Infof("[DataPlane] Add Policy called for %s", policy.PolicyKey)
 
-	dp.pendingPolicies[policy.PolicyKey] = struct{}{}
-
 	// Create and add references for Selector IPSets first
 	err := dp.createIPSetsAndReferences(policy.AllPodSelectorIPSets(), policy.PolicyKey, ipsets.SelectorType)
 	if err != nil {
@@ -232,7 +279,6 @@ func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
 	if err != nil {
 		return fmt.Errorf("[DataPlane] error while adding policy: %w", err)
 	}
-	delete(dp.pendingPolicies, policy.PolicyKey)
 	return nil
 }
 
@@ -301,8 +347,9 @@ func (dp *DataPlane) GetAllIPSets() map[string]string {
 	return dp.ipsetMgr.GetAllIPSets()
 }
 
+// GetAllPolicies is deprecated and only used in the goalstateprocessor, which is deprecated
 func (dp *DataPlane) GetAllPolicies() []string {
-	return dp.policyMgr.GetAllPolicies()
+	return nil
 }
 
 func (dp *DataPlane) createIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, netpolName string, referenceType ipsets.ReferenceType) error {
diff --git a/npm/pkg/dataplane/dataplane_linux.go b/npm/pkg/dataplane/dataplane_linux.go
@@ -6,16 +6,16 @@ import (
 )
 
 func (dp *DataPlane) getEndpointsToApplyPolicy(policy *policies.NPMNetworkPolicy) (map[string]string, error) {
-	// NOOP in Linux at the moment
+	// NOOP in Linux
 	return nil, nil
 }
 
 func (dp *DataPlane) shouldUpdatePod() bool {
 	return false
 }
 
-// updatePod is no-op in Linux
 func (dp *DataPlane) updatePod(pod *updateNPMPod) error {
+	// NOOP in Linux
 	return nil
 }
 
@@ -29,3 +29,8 @@ func (dp *DataPlane) bootupDataPlane() error {
 	}
 	return nil
 }
+
+func (dp *DataPlane) refreshAllPodEndpoints() error {
+	// NOOP in Linux
+	return nil
+}
diff --git a/npm/pkg/dataplane/dataplane_windows.go b/npm/pkg/dataplane/dataplane_windows.go
@@ -43,7 +43,8 @@ func (dp *DataPlane) initializeDataPlane() error {
 	}
 
 	// reset endpoint cache so that netpol references are removed for all endpoints while refreshing pod endpoints
-	dp.endpointCache = make(map[string]*npmEndpoint)
+	// no need to lock endpointCache at boot up
+	dp.endpointCache.cache = make(map[string]*npmEndpoint)
 	err = dp.refreshAllPodEndpoints()
 	if err != nil {
 		return err
@@ -111,14 +112,12 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error {
 		return nil
 	}
 
-	err := dp.refreshAllPodEndpoints()
-	if err != nil {
-		klog.Infof("[DataPlane] failed to refresh endpoints in updatePod with %s", err.Error())
-		return err
-	}
+	// lock the endpoint cache while we read/modify the endpoint with the pod's IP
+	dp.endpointCache.Lock()
+	defer dp.endpointCache.Unlock()
 
 	// Check if pod is already present in cache
-	endpoint, ok := dp.endpointCache[pod.PodIP]
+	endpoint, ok := dp.endpointCache.cache[pod.PodIP]
 	if !ok {
 		// ignore this err and pod endpoint will be deleted in ApplyDP
 		// if the endpoint is not found, it means the pod is not part of this node or pod got deleted.
@@ -133,7 +132,7 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error {
 			// Updates to this pod would not occur. Pod IPs are expected to change on restart though.
 			// See: https://stackoverflow.com/questions/52362514/when-will-the-kubernetes-pod-ip-change
 			// If a pod does restart and take up its previous IP, then the pod can be deleted/restarted to mitigate this problem.
-			klog.Infof("ignoring pod update since pod with key %s is stale and likely was deleted", pod.PodKey)
+			klog.Infof("[DataPlane] ignoring pod update since pod with key %s is stale and likely was deleted", pod.PodKey)
 			return nil
 		}
 		endpoint.podKey = pod.PodKey
@@ -175,8 +174,10 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error {
 		}
 
 		for policyKey := range selectorReference {
-			if _, ok := dp.pendingPolicies[policyKey]; !ok {
+			if dp.policyMgr.PolicyExists(policyKey) {
 				toAddPolicies[policyKey] = struct{}{}
+			} else {
+				klog.Infof("[DataPlane] while updating pod, policy is referenced but does not exist. pod: [%s], policy: [%s], set [%s]", pod.PodKey, policyKey, setName)
 			}
 		}
 	}
@@ -229,21 +230,19 @@ func (dp *DataPlane) getSelectorIPSets(policy *policies.NPMNetworkPolicy) map[st
 }
 
 func (dp *DataPlane) getEndpointsToApplyPolicy(policy *policies.NPMNetworkPolicy) (map[string]string, error) {
-	err := dp.refreshAllPodEndpoints()
-	if err != nil {
-		klog.Infof("[DataPlane] failed to refresh endpoints in getEndpointsToApplyPolicy with %s", err.Error())
-		return nil, err
-	}
-
 	selectorIPSets := dp.getSelectorIPSets(policy)
 	netpolSelectorIPs, err := dp.ipsetMgr.GetIPsFromSelectorIPSets(selectorIPSets)
 	if err != nil {
 		return nil, err
 	}
 
+	// lock the endpoint cache while we read/modify the endpoints with IPs in the policy's pod selector
+	dp.endpointCache.Lock()
+	defer dp.endpointCache.Unlock()
+
 	endpointList := make(map[string]string)
 	for ip := range netpolSelectorIPs {
-		endpoint, ok := dp.endpointCache[ip]
+		endpoint, ok := dp.endpointCache.cache[ip]
 		if !ok {
 			klog.Infof("[DataPlane] Ignoring endpoint with IP %s since it was not found in the endpoint cache. This IP might not be in the HNS network", ip)
 			continue
@@ -264,12 +263,32 @@ func (dp *DataPlane) getAllPodEndpoints() ([]hcn.HostComputeEndpoint, error) {
 }
 
 // refreshAllPodEndpoints will refresh all the pod endpoints and create empty netpol references for new endpoints
+/*
+Key Assumption: a new pod event (w/ IP) cannot come before HNS knows (and can tell us) about the endpoint.
+From NPM logs, it seems that endpoints are updated far earlier (several seconds) before the pod event comes in.
+
+What we learn from refreshing endpoints:
+- an old endpoint doesn't exist anymore
+- a new endpoint has come up
+
+Why not refresh when adding a netpol to all required pods?
+- It's ok if we try to apply on an endpoint that doesn't exist anymore.
+- We won't know the pod associated with a new endpoint even if we refresh.
+
+Why can we refresh only once before updating all pods in the updatePodCache (see ApplyDataplane)?
+- Again, it's ok if we try to apply on a non-existent endpoint.
+- We won't miss the endpoint (see the assumption). At the time the pod event came in (when AddToSets/RemoveFromSets were called), HNS already knew about the endpoint.
+*/
 func (dp *DataPlane) refreshAllPodEndpoints() error {
 	endpoints, err := dp.getAllPodEndpoints()
 	if err != nil {
 		return err
 	}
 
+	// lock the endpoint cache while we reconcile with HNS goal state
+	dp.endpointCache.Lock()
+	defer dp.endpointCache.Unlock()
+
 	currentTime := time.Now().Unix()
 	existingIPs := make(map[string]struct{})
 	for _, endpoint := range endpoints {
@@ -285,11 +304,11 @@ func (dp *DataPlane) refreshAllPodEndpoints() error {
 
 		existingIPs[ip] = struct{}{}
 
-		oldNPMEP, ok := dp.endpointCache[ip]
+		oldNPMEP, ok := dp.endpointCache.cache[ip]
 		if !ok {
 			// add the endpoint to the cache if it's not already there
 			npmEP := newNPMEndpoint(&endpoint)
-			dp.endpointCache[ip] = npmEP
+			dp.endpointCache.cache[ip] = npmEP
 			// NOTE: TSGs rely on this log line
 			klog.Infof("updating endpoint cache to include %s: %+v", npmEP.ip, npmEP)
 		} else if oldNPMEP.id != endpoint.Id {
@@ -299,29 +318,29 @@ func (dp *DataPlane) refreshAllPodEndpoints() error {
 			npmEP := newNPMEndpoint(&endpoint)
 			if oldNPMEP.podKey == unspecifiedPodKey {
 				klog.Infof("updating endpoint cache since endpoint changed for IP which never had a pod key. new endpoint: %s, old endpoint: %s, ip: %s", npmEP.id, oldNPMEP.id, npmEP.ip)
-				dp.endpointCache[ip] = npmEP
+				dp.endpointCache.cache[ip] = npmEP
 			} else {
 				npmEP.stalePodKey = &staleKey{
 					key:       oldNPMEP.podKey,
 					timestamp: currentTime,
 				}
-				dp.endpointCache[ip] = npmEP
+				dp.endpointCache.cache[ip] = npmEP
 				// NOTE: TSGs rely on this log line
 				klog.Infof("updating endpoint cache for previously cached IP %s: %+v with stalePodKey %+v", npmEP.ip, npmEP, npmEP.stalePodKey)
 			}
 		}
 	}
 
 	// garbage collection for the endpoint cache
-	for ip, ep := range dp.endpointCache {
+	for ip, ep := range dp.endpointCache.cache {
 		if _, ok := existingIPs[ip]; !ok {
 			if ep.podKey == unspecifiedPodKey {
 				if ep.stalePodKey == nil {
 					klog.Infof("deleting old endpoint which never had a pod key. ID: %s, IP: %s", ep.id, ip)
-					delete(dp.endpointCache, ip)
+					delete(dp.endpointCache.cache, ip)
 				} else if int(currentTime-ep.stalePodKey.timestamp)/60 > minutesToKeepStalePodKey {
 					klog.Infof("deleting old endpoint which had a stale pod key. ID: %s, IP: %s, stalePodKey: %+v", ep.id, ip, ep.stalePodKey)
-					delete(dp.endpointCache, ip)
+					delete(dp.endpointCache.cache, ip)
 				}
 			} else {
 				ep.stalePodKey = &staleKey{
@@ -349,8 +368,9 @@ func (dp *DataPlane) setNetworkIDByName(networkName string) error {
 }
 
 func (dp *DataPlane) getAllEndpointIDs() []string {
-	endpointIDs := make([]string, 0, len(dp.endpointCache))
-	for _, endpoint := range dp.endpointCache {
+	// no need to lock endpointCache at boot up
+	endpointIDs := make([]string, 0, len(dp.endpointCache.cache))
+	for _, endpoint := range dp.endpointCache.cache {
 		endpointIDs = append(endpointIDs, endpoint.id)
 	}
 	return endpointIDs
diff --git a/npm/pkg/dataplane/policies/chain-management_linux.go b/npm/pkg/dataplane/policies/chain-management_linux.go
@@ -168,7 +168,7 @@ func isBaseChain(chain string) bool {
 func (pMgr *PolicyManager) bootup(_ []string) error {
 	klog.Infof("booting up iptables Azure chains")
 
-	// Stop reconciling so we don't centend for iptables, and so we don't update the staleChains at the same time as reconcile()
+	// Stop reconciling so we don't contend for iptables, and so we don't update the staleChains at the same time as reconcile()
 	// Reconciling would only be happening if this function were called to reset iptables well into the azure-npm pod lifecycle.
 	pMgr.reconcileManager.forceLock()
 	defer pMgr.reconcileManager.forceUnlock()
diff --git a/npm/pkg/dataplane/policies/policymanager.go b/npm/pkg/dataplane/policies/policymanager.go
diff --git a/npm/pkg/dataplane/types.go b/npm/pkg/dataplane/types.go

Original file line number	Diff line number	Diff line change
`@@ -6,16 +6,16 @@ import (`
`6`	`6`	`)`
`7`	`7`
`8`	`8`	`func (dp *DataPlane) getEndpointsToApplyPolicy(policy *policies.NPMNetworkPolicy) (map[string]string, error) {`
`9`		`- // NOOP in Linux at the moment`
	`9`	`+ // NOOP in Linux`
`10`	`10`	`return nil, nil`
`11`	`11`	`}`
`12`	`12`
`13`	`13`	`func (dp *DataPlane) shouldUpdatePod() bool {`
`14`	`14`	`return false`
`15`	`15`	`}`
`16`	`16`
`17`		`-// updatePod is no-op in Linux`
`18`	`17`	`func (dp *DataPlane) updatePod(pod *updateNPMPod) error {`
	`18`	`+ // NOOP in Linux`
`19`	`19`	`return nil`
`20`	`20`	`}`
`21`	`21`
`@@ -29,3 +29,8 @@ func (dp *DataPlane) bootupDataPlane() error {`
`29`	`29`	`}`
`30`	`30`	`return nil`
`31`	`31`	`}`
	`32`	`+`
	`33`	`+func (dp *DataPlane) refreshAllPodEndpoints() error {`
	`34`	`+ // NOOP in Linux`
	`35`	`+ return nil`
	`36`	`+}`