@@ -44,6 +44,7 @@ import (
4444 "github.com/Azure/azure-container-networking/platform"
4545 localtls "github.com/Azure/azure-container-networking/server/tls"
4646 "github.com/Azure/azure-container-networking/store"
47+ "github.com/avast/retry-go/v3"
4748 "github.com/pkg/errors"
4849 "k8s.io/apimachinery/pkg/types"
4950 ctrl "sigs.k8s.io/controller-runtime"
@@ -60,6 +61,7 @@ const (
6061
6162 // 720 * acn.FiveSeconds sec sleeps = 1Hr
6263 maxRetryNodeRegister = 720
64+ initCNSInitalDelay = 10 * time .Second
6365)
6466
6567var (
@@ -308,57 +310,51 @@ func registerNode(httpc *http.Client, httpRestService cns.HTTPService, dncEP, in
308310 nodeRegisterRequest .NmAgentSupportedApis = supportedApis
309311
310312 // CNS tries to register Node for maximum of an hour.
311- for tryNum := 0 ; tryNum <= maxRetryNodeRegister ; tryNum ++ {
312- success , err := sendRegisterNodeRequest (httpc , httpRestService , nodeRegisterRequest , url )
313- if err != nil {
314- return err
315- }
316- if success {
317- return nil
318- }
319- time .Sleep (acn .FiveSeconds )
320- }
321- return fmt .Errorf ("[Azure CNS] Failed to register node %s after maximum reties for an hour with Infrastructure Network: %s PrivateEndpoint: %s" ,
322- nodeID , infraVnet , dncEP )
313+ err := retry .Do (func () error {
314+ return sendRegisterNodeRequest (httpc , httpRestService , nodeRegisterRequest , url )
315+ }, retry .Delay (acn .FiveSeconds ), retry .Attempts (maxRetryNodeRegister ), retry .DelayType (retry .FixedDelay ))
316+
317+ return errors .Wrap (err , fmt .Sprintf ("[Azure CNS] Failed to register node %s after maximum reties for an hour with Infrastructure Network: %s PrivateEndpoint: %s" ,
318+ nodeID , infraVnet , dncEP ))
323319}
324320
325321// sendRegisterNodeRequest func helps in registering the node until there is an error.
326322func sendRegisterNodeRequest (
327323 httpc * http.Client ,
328324 httpRestService cns.HTTPService ,
329325 nodeRegisterRequest cns.NodeRegisterRequest ,
330- registerURL string ) ( bool , error ) {
326+ registerURL string ) error {
331327
332328 var body bytes.Buffer
333329 err := json .NewEncoder (& body ).Encode (nodeRegisterRequest )
334330 if err != nil {
335331 log .Errorf ("[Azure CNS] Failed to register node while encoding json failed with non-retriable err %v" , err )
336- return false , err
332+ return errors . Wrap ( retry . Unrecoverable ( err ), "failed to sendRegisterNodeRequest" )
337333 }
338334
339335 response , err := httpc .Post (registerURL , "application/json" , & body )
340336 if err != nil {
341337 logger .Errorf ("[Azure CNS] Failed to register node with retriable err: %+v" , err )
342- return false , nil
338+ return errors . Wrap ( err , "failed to sendRegisterNodeRequest" )
343339 }
344340 defer response .Body .Close ()
345341
346342 if response .StatusCode != http .StatusCreated {
347343 err = fmt .Errorf ("[Azure CNS] Failed to register node, DNC replied with http status code %s" , strconv .Itoa (response .StatusCode ))
348344 logger .Errorf (err .Error ())
349- return false , nil
345+ return errors . Wrap ( err , "failed to sendRegisterNodeRequest" )
350346 }
351347
352348 var req cns.SetOrchestratorTypeRequest
353349 err = json .NewDecoder (response .Body ).Decode (& req )
354350 if err != nil {
355351 log .Errorf ("[Azure CNS] decoding Node Resgister response json failed with err %v" , err )
356- return false , nil
352+ return errors . Wrap ( err , "failed to sendRegisterNodeRequest" )
357353 }
358354 httpRestService .SetNodeOrchestrator (& req )
359355
360356 logger .Printf ("[Azure CNS] Node Registered" )
361- return true , nil
357+ return nil
362358}
363359
364360// Main is the entry point for CNS.
@@ -884,9 +880,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
884880 }
885881 }()
886882
887- err = initCNS (ctx , scopedcli , httpRestServiceImplementation )
888- if err != nil {
883+ // The NNC CRD might not be registered with the apiserver yet, or the apiserver might be down, and crashloop backoff
884+ // would put us outside of the 5-10 minutes we have for AKS addons to come up, so retry a bit more aggressively here.
885+ // Will retry 10 times, with the delay maxing out at a minute, taking about 8 minutes before it gives up.
886+ err = retry .Do (func () error {
887+ err = initCNS (ctx , scopedcli , httpRestServiceImplementation )
888+ if err != nil {
889+ logger .Errorf ("[Azure CNS] Failed to init cns with err: %v" , err )
890+ }
889891 return errors .Wrap (err , "failed to initialize CNS state" )
892+ }, retry .Context (ctx ), retry .Delay (initCNSInitalDelay ), retry .MaxDelay (time .Minute ))
893+ if err != nil {
894+ return err
890895 }
891896
892897 manager , err := ctrl .NewManager (kubeConfig , ctrl.Options {
0 commit comments