You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
fix: [NPM-WIN] lock the policyMap & updatePodCache (#1543)
* add policyMap lock
* add updatePodCache lock
* only lock policyMap in Windows and remove dead code
* defer unlocking and update comments
* debugging logs (remove later)
* more logs to remove
* revert test log commits
* aggregate errors while updating pods
* update log
* lock endpoint cache and remove pendingPolicies map
* refresh endpoints only before beginning all updatePod calls
* make policyMap lock for Linux too
Co-authored-by: Vamsi Kalapala <[email protected]>
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] failed to refresh endpoints while updating pods. err: [%s]", err.Error())
220
+
returnfmt.Errorf("[DataPlane] failed to refresh endpoints while updating pods. err: [%w]", err)
221
+
}
222
+
223
+
// lock updatePodCache while driving goal state to kernel
224
+
// prevents another ApplyDataplane call from updating the same pods
225
+
dp.updatePodCache.Lock()
226
+
deferdp.updatePodCache.Unlock()
227
+
228
+
varaggregateErrerror
229
+
forpodKey, pod:=rangedp.updatePodCache.cache {
189
230
err:=dp.updatePod(pod)
190
231
iferr!=nil {
191
-
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "error: failed to update pods: %s", err.Error())
192
-
returnfmt.Errorf("[DataPlane] error while updating pod: %w", err)
232
+
ifaggregateErr==nil {
233
+
aggregateErr=fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%w]", podKey, err)
234
+
} else {
235
+
aggregateErr=fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%s]. previous err: [%w]", podKey, err.Error(), aggregateErr)
236
+
}
237
+
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "failed to update pod while applying the dataplane. key: [%s], err: [%s]", podKey, err.Error())
238
+
continue
193
239
}
194
-
delete(dp.updatePodCache, podKey)
240
+
delete(dp.updatePodCache.cache, podKey)
241
+
}
242
+
ifaggregateErr!=nil {
243
+
returnfmt.Errorf("[DataPlane] error while updating pods: %w", err)
klog.Infof("[DataPlane] while updating pod, policy is referenced but does not exist. pod: [%s], policy: [%s], set [%s]", pod.PodKey, policyKey, setName)
// refreshAllPodEndpoints will refresh all the pod endpoints and create empty netpol references for new endpoints
266
+
/*
267
+
Key Assumption: a new pod event (w/ IP) cannot come before HNS knows (and can tell us) about the endpoint.
268
+
From NPM logs, it seems that endpoints are updated well in advance (several seconds) of the pod event coming in.
269
+
270
+
What we learn from refreshing endpoints:
271
+
- an old endpoint doesn't exist anymore
272
+
- a new endpoint has come up
273
+
274
+
Why not refresh when adding a netpol to all required pods?
275
+
- It's ok if we try to apply on an endpoint that doesn't exist anymore.
276
+
- We won't know the pod associated with a new endpoint even if we refresh.
277
+
278
+
Why can we refresh only once before updating all pods in the updatePodCache (see ApplyDataplane)?
279
+
- Again, it's ok if we try to apply on a non-existent endpoint.
280
+
- We won't miss the endpoint (see the assumption). At the time the pod event came in (when AddToSets/RemoveFromSets were called), HNS already knew about the endpoint.
klog.Infof("updating endpoint cache since endpoint changed for IP which never had a pod key. new endpoint: %s, old endpoint: %s, ip: %s", npmEP.id, oldNPMEP.id, npmEP.ip)
302
-
dp.endpointCache[ip] =npmEP
321
+
dp.endpointCache.cache[ip] =npmEP
303
322
} else {
304
323
npmEP.stalePodKey=&staleKey{
305
324
key: oldNPMEP.podKey,
306
325
timestamp: currentTime,
307
326
}
308
-
dp.endpointCache[ip] =npmEP
327
+
dp.endpointCache.cache[ip] =npmEP
309
328
// NOTE: TSGs rely on this log line
310
329
klog.Infof("updating endpoint cache for previously cached IP %s: %+v with stalePodKey %+v", npmEP.ip, npmEP, npmEP.stalePodKey)
311
330
}
312
331
}
313
332
}
314
333
315
334
// garbage collection for the endpoint cache
316
-
forip, ep:=rangedp.endpointCache {
335
+
forip, ep:=rangedp.endpointCache.cache {
317
336
if_, ok:=existingIPs[ip]; !ok {
318
337
ifep.podKey==unspecifiedPodKey {
319
338
ifep.stalePodKey==nil {
320
339
klog.Infof("deleting old endpoint which never had a pod key. ID: %s, IP: %s", ep.id, ip)
0 commit comments