@@ -6,13 +6,15 @@ import (
66 "fmt"
77 "strings"
88
9+ "github.com/Azure/azure-container-networking/npm/metrics"
10+ "github.com/Azure/azure-container-networking/npm/util"
911 "github.com/Microsoft/hcsshim/hcn"
1012 "k8s.io/klog"
1113)
1214
// Package-level sentinel errors and the named shouldResetAllACLs values used by this file.
1315var (
// Sentinel errors for ACL settings marshal/unmarshal failures.
// NOTE(review): the code that returns them is not visible in this excerpt.
14- ErrFailedMarshalACLSettings = errors .New ("Failed to marshal ACL settings" )
15- ErrFailedUnMarshalACLSettings = errors .New ("Failed to unmarshal ACL settings" )
16+ ErrFailedMarshalACLSettings = errors .New ("failed to marshal ACL settings" )
17+ ErrFailedUnMarshalACLSettings = errors .New ("failed to unmarshal ACL settings" )
// Named values for the last argument of removePolicyByEndpointID:
// bootup passes resetAllACLs, removePolicy passes removeOnlyGivenPolicy.
1618 resetAllACLs shouldResetAllACLs = true
1719 removeOnlyGivenPolicy shouldResetAllACLs = false
1820)
@@ -35,11 +37,19 @@ func (pMgr *PolicyManager) bootup(epIDs []string) error {
3537 for _ , epID := range epIDs {
3638 err := pMgr .removePolicyByEndpointID ("" , epID , 0 , resetAllACLs )
3739 if err != nil {
38- aggregateErr = fmt .Errorf ("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w" , epID , err .Error (), aggregateErr )
40+ if aggregateErr == nil {
41+ aggregateErr = fmt .Errorf ("skipping resetting policies on %s ID Endpoint with err: %w" , epID , err )
42+ } else {
43+ aggregateErr = fmt .Errorf ("skipping resetting policies on %s ID Endpoint with err: %s. previous err: [%w]" , epID , err .Error (), aggregateErr )
44+ }
3945 continue
4046 }
4147 }
42- return aggregateErr
48+
49+ if aggregateErr != nil {
50+ return fmt .Errorf ("[PolicyManagerWindows] %w" , aggregateErr )
51+ }
52+ return nil
4353}
4454
4555func (pMgr * PolicyManager ) reconcile () {
@@ -99,17 +109,24 @@ func (pMgr *PolicyManager) addPolicy(policy *NPMNetworkPolicy, endpointList map[
99109 for epIP , epID := range endpointList {
100110 err = pMgr .applyPoliciesToEndpointID (epID , epPolicyRequest )
101111 if err != nil {
102- klog .Infof ( "[PolicyManagerWindows] Failed to add policy on %s ID Endpoint with %s err" , epID , err .Error ())
112+ klog .Errorf ( "failed to add policy to kernel. policy %s, endpoint: %s, err: %s" , policy . PolicyKey , epID , err .Error ())
103113 // Do not return if one endpoint fails, try all endpoints.
104114 // aggregate the error message and return it at the end
105- aggregateErr = fmt .Errorf ("Failed to add policy on %s ID Endpoint with %s err \n Previous %w" , epID , err .Error (), aggregateErr )
115+ if aggregateErr == nil {
116+ aggregateErr = fmt .Errorf ("failed to add policy on %s ID Endpoint with err: %w" , epID , err )
117+ } else {
118+ aggregateErr = fmt .Errorf ("failed to add policy on %s ID Endpoint with err: %s. previous err: [%w]" , epID , err .Error (), aggregateErr )
119+ }
106120 continue
107121 }
108122 // Now update policy cache to reflect new endpoint
109123 policy .PodEndpoints [epIP ] = epID
110124 }
111125
112- return aggregateErr
126+ if aggregateErr != nil {
127+ return fmt .Errorf ("[PolicyManagerWindows] %w" , aggregateErr )
128+ }
129+ return nil
113130}
114131
115132// removePolicy will remove the policy from the specified endpoints, or
@@ -136,29 +153,41 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m
136153 for epIPAddr , epID := range endpointList {
137154 err := pMgr .removePolicyByEndpointID (rulesToRemove [0 ].Id , epID , numOfRulesToRemove , removeOnlyGivenPolicy )
138155 if err != nil {
139- aggregateErr = fmt .Errorf ("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w" , epID , err .Error (), aggregateErr )
156+ if aggregateErr == nil {
157+ aggregateErr = fmt .Errorf ("skipping removing policy on %s ID Endpoint with err: %w" , epID , err )
158+ } else {
159+ aggregateErr = fmt .Errorf ("skipping removing policy on %s ID Endpoint with err: %s. previous err: [%w]" , epID , err .Error (), aggregateErr )
160+ }
140161 continue
141162 }
142163
143164 // Delete podendpoint from policy cache
144165 delete (policy .PodEndpoints , epIPAddr )
145166 }
146167
147- return aggregateErr
168+ if aggregateErr != nil {
169+ return fmt .Errorf ("[PolicyManagerWindows] while removing policy %s, %w" , policy .PolicyKey , aggregateErr )
170+ }
171+ return nil
148172}
149173
150174func (pMgr * PolicyManager ) removePolicyByEndpointID (ruleID , epID string , noOfRulesToRemove int , resetAllACL shouldResetAllACLs ) error {
151- epObj , err := pMgr .getEndpointByID (epID )
175+ epObj , err := pMgr .ioShim . Hns . GetEndpointByID (epID )
152176 if err != nil {
153- return fmt .Errorf ("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err" , epID , err .Error ())
177+ if isNotFoundErr (err ) {
178+ klog .Infof ("[PolicyManagerWindows] ignoring remove policy on endpoint since the endpoint wasn't found. the corresponding pod was most likely deleted. policy: %s, endpoint: %s" , ruleID , epID )
179+ return nil
180+ }
181+ return fmt .Errorf ("[PolicyManagerWindows] failed to remove policy while getting the endpoint. policy: %s, endpoint: %s, err: %w" , ruleID , epID , err )
154182 }
183+
155184 if len (epObj .Policies ) == 0 {
156185 klog .Infof ("[DataPlanewindows] No Policies to remove on %s ID Endpoint" , epID )
157186 }
158187
159188 epBuilder , err := splitEndpointPolicies (epObj .Policies )
160189 if err != nil {
161- return fmt .Errorf ("[PolicyManagerWindows] Skipping removing policies on %s ID Endpoint with %s err" , epID , err .Error ())
190+ return fmt .Errorf ("couldn't split endpoint policies while trying to remove policy. policy: %s, endpoint: %s, err: %s" , ruleID , epID , err .Error ())
162191 }
163192
164193 if resetAllACL {
@@ -178,51 +207,36 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul
178207 klog .Infof ("[DataPlanewindows] Epbuilder Other policies before removing %+v" , epBuilder .otherPolicies )
179208 epPolicies , err := epBuilder .getHCNPolicyRequest ()
180209 if err != nil {
181- return fmt .Errorf ("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err" , epID , err .Error ())
210+ return fmt .Errorf ("unable to get HCN policy request while trying to remove policy. policy: %s, endpoint: %s, err: %s" , ruleID , epID , err .Error ())
182211 }
183212
184- err = pMgr .updatePoliciesOnEndpoint (epObj , epPolicies )
213+ err = pMgr .ioShim . Hns . ApplyEndpointPolicy (epObj , hcn . RequestTypeUpdate , epPolicies )
185214 if err != nil {
186- return fmt .Errorf ("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err" , epID , err . Error () )
215+ return fmt .Errorf ("unable to apply changes when removing policy. policy: %s, endpoint: %s, err: %w " , ruleID , epID , err )
187216 }
188217 return nil
189218}
190219
191220// addEPPolicyWithEpID given an EP ID and a list of policies, add the policies to the endpoint
192221func (pMgr * PolicyManager ) applyPoliciesToEndpointID (epID string , policies hcn.PolicyEndpointRequest ) error {
193- epObj , err := pMgr .getEndpointByID (epID )
222+ epObj , err := pMgr .ioShim . Hns . GetEndpointByID (epID )
194223 if err != nil {
195- klog .Infof ("[PolicyManagerWindows] Skipping applying policies %s ID Endpoint with %s err" , epID , err .Error ())
196- return err
224+ if isNotFoundErr (err ) {
225+ // unlikely scenario where an endpoint is deleted right after we refresh HNS endpoints, or an unlikely scenario where an endpoint is deleted right after we refresh HNS endpoints
226+ metrics .SendErrorLogAndMetric (util .IptmID , "[PolicyManagerWindows] ignoring apply policies to endpoint since the endpoint wasn't found. endpoint: %s" , epID )
227+ return nil
228+ }
229+ return fmt .Errorf ("[PolicyManagerWindows] to apply policies while getting the endpoint. endpoint: %s, err: %w" , epID , err )
197230 }
198231
199232 err = pMgr .ioShim .Hns .ApplyEndpointPolicy (epObj , hcn .RequestTypeAdd , policies )
200233 if err != nil {
201- klog .Infof ("[PolicyManagerWindows]Failed to apply policies on %s ID Endpoint with %s err" , epID , err .Error ())
202- return err
203- }
204- return nil
205- }
206-
207- // addEPPolicyWithEpID given an EP ID and a list of policies, add the policies to the endpoint
208- func (pMgr * PolicyManager ) updatePoliciesOnEndpoint (epObj * hcn.HostComputeEndpoint , policies hcn.PolicyEndpointRequest ) error {
209- err := pMgr .ioShim .Hns .ApplyEndpointPolicy (epObj , hcn .RequestTypeUpdate , policies )
210- if err != nil {
211- klog .Infof ("[PolicyManagerWindows]Failed to update/remove policies on %s ID Endpoint with %s err" , epObj .Id , err .Error ())
234+ klog .Errorf ("[PolicyManagerWindows] failed to apply policies. endpoint: %s, err: %s" , epID , err .Error ())
212235 return err
213236 }
214237 return nil
215238}
216239
217- func (pMgr * PolicyManager ) getEndpointByID (id string ) (* hcn.HostComputeEndpoint , error ) {
218- epObj , err := pMgr .ioShim .Hns .GetEndpointByID (id )
219- if err != nil {
220- klog .Infof ("[PolicyManagerWindows] Failed to get EndPoint object of %s ID from HNS" , id )
221- return nil , err
222- }
223- return epObj , nil
224- }
225-
226240// getEPPolicyReqFromACLSettings converts given ACLSettings into PolicyEndpointRequest
227241func getEPPolicyReqFromACLSettings (settings []* NPMACLPolSettings ) (hcn.PolicyEndpointRequest , error ) {
228242 policyToAdd := hcn.PolicyEndpointRequest {
@@ -360,3 +374,8 @@ func (epBuilder *endpointPolicyBuilder) removeACLPolicyAtIndex(indexes map[int]s
360374 }
361375 epBuilder .aclPolicies = tempAclPolicies
362376}
377+
378+ func isNotFoundErr (err error ) bool {
379+ var notFoundErr hcn.EndpointNotFoundError
380+ return errors .As (err , & notFoundErr )
381+ }
0 commit comments