Skip to content

Commit 7db16e8

Browse files
ignore endpoint not found errors and clean up logs (#1440)
1 parent 618bf29 commit 7db16e8

File tree

1 file changed

+57
-38
lines changed

1 file changed

+57
-38
lines changed

npm/pkg/dataplane/policies/policymanager_windows.go

Lines changed: 57 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ import (
66
"fmt"
77
"strings"
88

9+
"github.com/Azure/azure-container-networking/npm/metrics"
10+
"github.com/Azure/azure-container-networking/npm/util"
911
"github.com/Microsoft/hcsshim/hcn"
1012
"k8s.io/klog"
1113
)
1214

1315
var (
14-
ErrFailedMarshalACLSettings = errors.New("Failed to marshal ACL settings")
15-
ErrFailedUnMarshalACLSettings = errors.New("Failed to unmarshal ACL settings")
16+
ErrFailedMarshalACLSettings = errors.New("failed to marshal ACL settings")
17+
ErrFailedUnMarshalACLSettings = errors.New("failed to unmarshal ACL settings")
1618
resetAllACLs shouldResetAllACLs = true
1719
removeOnlyGivenPolicy shouldResetAllACLs = false
1820
)
@@ -35,11 +37,19 @@ func (pMgr *PolicyManager) bootup(epIDs []string) error {
3537
for _, epID := range epIDs {
3638
err := pMgr.removePolicyByEndpointID("", epID, 0, resetAllACLs)
3739
if err != nil {
38-
aggregateErr = fmt.Errorf("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr)
40+
if aggregateErr == nil {
41+
aggregateErr = fmt.Errorf("skipping resetting policies on %s ID Endpoint with err: %w", epID, err)
42+
} else {
43+
aggregateErr = fmt.Errorf("skipping resetting policies on %s ID Endpoint with err: %s. previous err: [%w]", epID, err.Error(), aggregateErr)
44+
}
3945
continue
4046
}
4147
}
42-
return aggregateErr
48+
49+
if aggregateErr != nil {
50+
return fmt.Errorf("[PolicyManagerWindows] %w", aggregateErr)
51+
}
52+
return nil
4353
}
4454

4555
func (pMgr *PolicyManager) reconcile() {
@@ -99,17 +109,24 @@ func (pMgr *PolicyManager) addPolicy(policy *NPMNetworkPolicy, endpointList map[
99109
for epIP, epID := range endpointList {
100110
err = pMgr.applyPoliciesToEndpointID(epID, epPolicyRequest)
101111
if err != nil {
102-
klog.Infof("[PolicyManagerWindows] Failed to add policy on %s ID Endpoint with %s err", epID, err.Error())
112+
klog.Errorf("failed to add policy to kernel. policy %s, endpoint: %s, err: %s", policy.PolicyKey, epID, err.Error())
103113
// Do not return if one endpoint fails, try all endpoints.
104114
// aggregate the error message and return it at the end
105-
aggregateErr = fmt.Errorf("Failed to add policy on %s ID Endpoint with %s err \n Previous %w", epID, err.Error(), aggregateErr)
115+
if aggregateErr == nil {
116+
aggregateErr = fmt.Errorf("failed to add policy on %s ID Endpoint with err: %w", epID, err)
117+
} else {
118+
aggregateErr = fmt.Errorf("failed to add policy on %s ID Endpoint with err: %s. previous err: [%w]", epID, err.Error(), aggregateErr)
119+
}
106120
continue
107121
}
108122
// Now update policy cache to reflect new endpoint
109123
policy.PodEndpoints[epIP] = epID
110124
}
111125

112-
return aggregateErr
126+
if aggregateErr != nil {
127+
return fmt.Errorf("[PolicyManagerWindows] %w", aggregateErr)
128+
}
129+
return nil
113130
}
114131

115132
// removePolicy will remove the policy from the specified endpoints, or
@@ -136,29 +153,41 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m
136153
for epIPAddr, epID := range endpointList {
137154
err := pMgr.removePolicyByEndpointID(rulesToRemove[0].Id, epID, numOfRulesToRemove, removeOnlyGivenPolicy)
138155
if err != nil {
139-
aggregateErr = fmt.Errorf("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr)
156+
if aggregateErr == nil {
157+
aggregateErr = fmt.Errorf("skipping removing policy on %s ID Endpoint with err: %w", epID, err)
158+
} else {
159+
aggregateErr = fmt.Errorf("skipping removing policy on %s ID Endpoint with err: %s. previous err: [%w]", epID, err.Error(), aggregateErr)
160+
}
140161
continue
141162
}
142163

143164
// Delete podendpoint from policy cache
144165
delete(policy.PodEndpoints, epIPAddr)
145166
}
146167

147-
return aggregateErr
168+
if aggregateErr != nil {
169+
return fmt.Errorf("[PolicyManagerWindows] while removing policy %s, %w", policy.PolicyKey, aggregateErr)
170+
}
171+
return nil
148172
}
149173

150174
func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRulesToRemove int, resetAllACL shouldResetAllACLs) error {
151-
epObj, err := pMgr.getEndpointByID(epID)
175+
epObj, err := pMgr.ioShim.Hns.GetEndpointByID(epID)
152176
if err != nil {
153-
return fmt.Errorf("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error())
177+
if isNotFoundErr(err) {
178+
klog.Infof("[PolicyManagerWindows] ignoring remove policy on endpoint since the endpoint wasn't found. the corresponding pod was most likely deleted. policy: %s, endpoint: %s", ruleID, epID)
179+
return nil
180+
}
181+
return fmt.Errorf("[PolicyManagerWindows] failed to remove policy while getting the endpoint. policy: %s, endpoint: %s, err: %w", ruleID, epID, err)
154182
}
183+
155184
if len(epObj.Policies) == 0 {
156185
klog.Infof("[DataPlanewindows] No Policies to remove on %s ID Endpoint", epID)
157186
}
158187

159188
epBuilder, err := splitEndpointPolicies(epObj.Policies)
160189
if err != nil {
161-
return fmt.Errorf("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error())
190+
return fmt.Errorf("couldn't split endpoint policies while trying to remove policy. policy: %s, endpoint: %s, err: %s", ruleID, epID, err.Error())
162191
}
163192

164193
if resetAllACL {
@@ -178,51 +207,36 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul
178207
klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies)
179208
epPolicies, err := epBuilder.getHCNPolicyRequest()
180209
if err != nil {
181-
return fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error())
210+
return fmt.Errorf("unable to get HCN policy request while trying to remove policy. policy: %s, endpoint: %s, err: %s", ruleID, epID, err.Error())
182211
}
183212

184-
err = pMgr.updatePoliciesOnEndpoint(epObj, epPolicies)
213+
err = pMgr.ioShim.Hns.ApplyEndpointPolicy(epObj, hcn.RequestTypeUpdate, epPolicies)
185214
if err != nil {
186-
return fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error())
215+
return fmt.Errorf("unable to apply changes when removing policy. policy: %s, endpoint: %s, err: %w", ruleID, epID, err)
187216
}
188217
return nil
189218
}
190219

191220
// applyPoliciesToEndpointID given an EP ID and a list of policies, adds the policies to the endpoint
192221
func (pMgr *PolicyManager) applyPoliciesToEndpointID(epID string, policies hcn.PolicyEndpointRequest) error {
193-
epObj, err := pMgr.getEndpointByID(epID)
222+
epObj, err := pMgr.ioShim.Hns.GetEndpointByID(epID)
194223
if err != nil {
195-
klog.Infof("[PolicyManagerWindows] Skipping applying policies %s ID Endpoint with %s err", epID, err.Error())
196-
return err
224+
if isNotFoundErr(err) {
225+
// unlikely scenario where an endpoint is deleted right after we refresh HNS endpoints
226+
metrics.SendErrorLogAndMetric(util.IptmID, "[PolicyManagerWindows] ignoring apply policies to endpoint since the endpoint wasn't found. endpoint: %s", epID)
227+
return nil
228+
}
229+
return fmt.Errorf("[PolicyManagerWindows] to apply policies while getting the endpoint. endpoint: %s, err: %w", epID, err)
197230
}
198231

199232
err = pMgr.ioShim.Hns.ApplyEndpointPolicy(epObj, hcn.RequestTypeAdd, policies)
200233
if err != nil {
201-
klog.Infof("[PolicyManagerWindows]Failed to apply policies on %s ID Endpoint with %s err", epID, err.Error())
202-
return err
203-
}
204-
return nil
205-
}
206-
207-
// addEPPolicyWithEpID given an EP ID and a list of policies, add the policies to the endpoint
208-
func (pMgr *PolicyManager) updatePoliciesOnEndpoint(epObj *hcn.HostComputeEndpoint, policies hcn.PolicyEndpointRequest) error {
209-
err := pMgr.ioShim.Hns.ApplyEndpointPolicy(epObj, hcn.RequestTypeUpdate, policies)
210-
if err != nil {
211-
klog.Infof("[PolicyManagerWindows]Failed to update/remove policies on %s ID Endpoint with %s err", epObj.Id, err.Error())
234+
klog.Errorf("[PolicyManagerWindows] failed to apply policies. endpoint: %s, err: %s", epID, err.Error())
212235
return err
213236
}
214237
return nil
215238
}
216239

217-
func (pMgr *PolicyManager) getEndpointByID(id string) (*hcn.HostComputeEndpoint, error) {
218-
epObj, err := pMgr.ioShim.Hns.GetEndpointByID(id)
219-
if err != nil {
220-
klog.Infof("[PolicyManagerWindows] Failed to get EndPoint object of %s ID from HNS", id)
221-
return nil, err
222-
}
223-
return epObj, nil
224-
}
225-
226240
// getEPPolicyReqFromACLSettings converts given ACLSettings into PolicyEndpointRequest
227241
func getEPPolicyReqFromACLSettings(settings []*NPMACLPolSettings) (hcn.PolicyEndpointRequest, error) {
228242
policyToAdd := hcn.PolicyEndpointRequest{
@@ -360,3 +374,8 @@ func (epBuilder *endpointPolicyBuilder) removeACLPolicyAtIndex(indexes map[int]s
360374
}
361375
epBuilder.aclPolicies = tempAclPolicies
362376
}
377+
378+
// isNotFoundErr reports whether err, or any error in its wrap chain, is an
// hcn.EndpointNotFoundError (matched via errors.As). Callers in this file use
// a true result to treat a missing endpoint as a no-op instead of a failure.
func isNotFoundErr(err error) bool {
379+
var notFoundErr hcn.EndpointNotFoundError
380+
return errors.As(err, &notFoundErr)
381+
}

0 commit comments

Comments
 (0)