Skip to content

Commit 025ec53

Browse files
fix: [WIN-NPM] don't remove policy from cache when updating pod (#1568)
* do not remove policy from cache when updating pod * dp windows changes
1 parent 7a369c4 commit 025ec53

File tree

7 files changed

+59
-16
lines changed

7 files changed

+59
-16
lines changed

npm/pkg/dataplane/dataplane.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error {
293293
return nil
294294
}
295295
// Use the endpoint list saved in cache for this network policy to remove
296-
err := dp.policyMgr.RemovePolicy(policy.PolicyKey, nil)
296+
err := dp.policyMgr.RemovePolicy(policy.PolicyKey)
297297
if err != nil {
298298
return fmt.Errorf("[DataPlane] error while removing policy: %w", err)
299299
}

npm/pkg/dataplane/dataplane_windows.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error {
169169
endpointList := map[string]string{
170170
endpoint.ip: endpoint.id,
171171
}
172-
err := dp.policyMgr.RemovePolicy(policyKey, endpointList)
172+
err := dp.policyMgr.RemovePolicyForEndpoints(policyKey, endpointList)
173173
if err != nil {
174174
return err
175175
}

npm/pkg/dataplane/policies/policymanager.go

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,13 +130,18 @@ func (pMgr *PolicyManager) AddPolicy(policy *NPMNetworkPolicy, endpointList map[
130130
metrics.RecordACLRuleExecTime(timer) // record execution time regardless of failure
131131
if err != nil {
132132
// NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully
133+
// In Windows, Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully.
133134
msg := fmt.Sprintf("failed to add policy: %s", err.Error())
134135
metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg)
135136
return npmerrors.Errorf(npmerrors.AddPolicy, false, msg)
136137
}
137138

138139
// update Prometheus metrics on success
139-
metrics.IncNumACLRulesBy(policy.numACLRulesProducedInKernel())
140+
numEndpoints := 1
141+
if util.IsWindowsDP() {
142+
numEndpoints = len(endpointList)
143+
}
144+
metrics.IncNumACLRulesBy(policy.numACLRulesProducedInKernel() * numEndpoints)
140145

141146
pMgr.policyMap.cache[policy.PolicyKey] = policy
142147
return nil
@@ -146,7 +151,7 @@ func (pMgr *PolicyManager) isFirstPolicy() bool {
146151
return len(pMgr.policyMap.cache) == 0
147152
}
148153

149-
func (pMgr *PolicyManager) RemovePolicy(policyKey string, endpointList map[string]string) error {
154+
func (pMgr *PolicyManager) RemovePolicy(policyKey string) error {
150155
policy, ok := pMgr.GetPolicy(policyKey)
151156

152157
if !ok {
@@ -162,22 +167,57 @@ func (pMgr *PolicyManager) RemovePolicy(policyKey string, endpointList map[strin
162167
defer pMgr.policyMap.Unlock()
163168

164169
// Call actual dataplane function to apply changes
165-
err := pMgr.removePolicy(policy, endpointList)
170+
err := pMgr.removePolicy(policy, nil)
166171
// currently we only have acl rule exec time for "adding" rules, so we skip recording here
167172
if err != nil {
168-
// NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully
173+
// NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully.
174+
// In Windows, Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully.
169175
msg := fmt.Sprintf("failed to remove policy: %s", err.Error())
170176
metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg)
171177
return npmerrors.Errorf(npmerrors.RemovePolicy, false, msg)
172178
}
173179

174180
// update Prometheus metrics on success
175-
metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel())
181+
numEndpoints := 1
182+
if util.IsWindowsDP() {
183+
numEndpoints = len(policy.PodEndpoints)
184+
}
185+
metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel() * numEndpoints)
176186

187+
// remove policy from cache
177188
delete(pMgr.policyMap.cache, policyKey)
178189
return nil
179190
}
180191

192+
// RemovePolicyForEndpoints is identical to RemovePolicy except it will not remove the policy from the cache.
193+
// This function is intended for Windows only.
194+
func (pMgr *PolicyManager) RemovePolicyForEndpoints(policyKey string, endpointList map[string]string) error {
195+
policy, ok := pMgr.GetPolicy(policyKey)
196+
197+
if !ok {
198+
return nil
199+
}
200+
201+
if len(policy.ACLs) == 0 {
202+
klog.Infof("[DataPlane] No ACLs in policy %s to remove for endpoints", policyKey)
203+
return nil
204+
}
205+
// Call actual dataplane function to apply changes
206+
err := pMgr.removePolicy(policy, endpointList)
207+
// currently we only have acl rule exec time for "adding" rules, so we skip recording here
208+
if err != nil {
209+
// NOTE: Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully.
210+
msg := fmt.Sprintf("failed to remove policy. endpoints: [%+v]. err: [%s]", endpointList, err.Error())
211+
metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg)
212+
return npmerrors.Errorf(npmerrors.RemovePolicy, false, msg)
213+
}
214+
215+
// update Prometheus metrics on success
216+
metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel() * len(endpointList))
217+
218+
return nil
219+
}
220+
181221
func (pMgr *PolicyManager) isLastPolicy() bool {
182222
// if we change our code to delete more than one policy at once, we can specify numPoliciesToDelete as an argument
183223
numPoliciesToDelete := 1

npm/pkg/dataplane/policies/policymanager_linux_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ func TestRemovePoliciesAcceptableError(t *testing.T) {
334334
defer ioshim.VerifyCalls(t, calls)
335335
pMgr := NewPolicyManager(ioshim, ipsetConfig)
336336
require.NoError(t, pMgr.AddPolicy(bothDirectionsNetPol, epList))
337-
require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil))
337+
require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey))
338338
_, ok := pMgr.GetPolicy(bothDirectionsNetPol.PolicyKey)
339339
require.False(t, ok)
340340
promVals{0, 1}.testPrometheusMetrics(t)
@@ -381,7 +381,7 @@ func TestRemovePoliciesError(t *testing.T) {
381381
pMgr := NewPolicyManager(ioshim, ipsetConfig)
382382
err := pMgr.AddPolicy(bothDirectionsNetPol, nil)
383383
require.NoError(t, err)
384-
err = pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil)
384+
err = pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey)
385385
require.Error(t, err)
386386

387387
promVals{6, 1}.testPrometheusMetrics(t)
@@ -407,23 +407,23 @@ func TestUpdatingStaleChains(t *testing.T) {
407407
assertStaleChainsContain(t, pMgr.staleChains)
408408

409409
// successful removal, so mark the policy's chains as stale
410-
require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil))
410+
require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey))
411411
assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain)
412412

413413
// successful add, so keep the same stale chains
414414
require.NoError(t, pMgr.AddPolicy(ingressNetPol, nil))
415415
assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain)
416416

417417
// failure to remove, so keep the same stale chains
418-
require.Error(t, pMgr.RemovePolicy(ingressNetPol.PolicyKey, nil))
418+
require.Error(t, pMgr.RemovePolicy(ingressNetPol.PolicyKey))
419419
assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain)
420420

421421
// successfully add a new policy. keep the same stale chains
422422
require.NoError(t, pMgr.AddPolicy(egressNetPol, nil))
423423
assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain)
424424

425425
// successful removal, so mark the policy's chains as stale
426-
require.NoError(t, pMgr.RemovePolicy(egressNetPol.PolicyKey, nil))
426+
require.NoError(t, pMgr.RemovePolicy(egressNetPol.PolicyKey))
427427
assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain, egressNetPolChain)
428428

429429
// failure to add, so keep the same stale chains the same

npm/pkg/dataplane/policies/policymanager_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func TestRemovePolicy(t *testing.T) {
172172
defer ioshim.VerifyCalls(t, calls)
173173
pMgr := NewPolicyManager(ioshim, ipsetConfig)
174174
require.NoError(t, pMgr.AddPolicy(testNetPol, epList))
175-
require.NoError(t, pMgr.RemovePolicy(testNetPol.PolicyKey, nil))
175+
require.NoError(t, pMgr.RemovePolicy(testNetPol.PolicyKey))
176176
_, ok := pMgr.GetPolicy(testNetPol.PolicyKey)
177177
require.False(t, ok)
178178
promVals{0, 1}.testPrometheusMetrics(t)
@@ -183,7 +183,7 @@ func TestRemoveNonexistentPolicy(t *testing.T) {
183183
metrics.ReinitializeAll()
184184
ioshim := common.NewMockIOShim(nil)
185185
pMgr := NewPolicyManager(ioshim, ipsetConfig)
186-
require.NoError(t, pMgr.RemovePolicy("wrong-policy-key", epList))
186+
require.NoError(t, pMgr.RemovePolicy("wrong-policy-key"))
187187
promVals{0, 0}.testPrometheusMetrics(t)
188188
}
189189

npm/pkg/dataplane/policies/policymanager_windows.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m
144144
if err != nil {
145145
return err
146146
}
147+
// FIXME rulesToRemove is a list of pointers
147148
klog.Infof("[PolicyManagerWindows] To Remove Policy: %s \n To Delete ACLs: %+v \n To Remove From %+v endpoints", policy.PolicyKey, rulesToRemove, endpointList)
148149
// If remove bug is solved we can directly remove the exact policy from the endpoint
149150
// but if the bug is not solved then get all existing policies and remove relevant policies from list
@@ -204,6 +205,7 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul
204205
return nil
205206
}
206207
}
208+
// FIXME epBuilder.aclPolicies is a list of pointers
207209
klog.Infof("[DataPlanewindows] Epbuilder ACL policies before removing %+v", epBuilder.aclPolicies)
208210
klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies)
209211
epPolicies, err := epBuilder.getHCNPolicyRequest()
@@ -246,6 +248,7 @@ func getEPPolicyReqFromACLSettings(settings []*NPMACLPolSettings) (hcn.PolicyEnd
246248
}
247249

248250
for i, acl := range settings {
251+
// FIXME a lot of prints
249252
klog.Infof("Acl settings: %+v", acl)
250253
byteACL, err := json.Marshal(acl)
251254
if err != nil {

npm/pkg/dataplane/policies/policymanager_windows_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ func TestRemovePolicies(t *testing.T) {
124124
verifyFakeHNSCacheACLs(t, expectedACLs, acls)
125125
}
126126

127-
err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey, nil)
127+
err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey)
128128
require.NoError(t, err)
129129
verifyACLCacheIsCleaned(t, hns, len(endPointIDList))
130130
}
@@ -150,7 +150,7 @@ func TestRemovePoliciesEndpointNotFound(t *testing.T) {
150150
testendPointIDList := map[string]string{
151151
"10.0.0.5": "test10",
152152
}
153-
err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey, testendPointIDList)
153+
err = pMgr.RemovePolicyForEndpoints(TestNetworkPolicies[0].PolicyKey, testendPointIDList)
154154
require.NoError(t, err, err)
155155
verifyACLCacheIsCleaned(t, hns, len(endPointIDList))
156156
}

0 commit comments

Comments
 (0)