Skip to content

Commit 27ac431

Browse files
authored
Use avast to retry init cns and register node. (#1087)
* stupid simple retry * go lint fixes * missed one // * avast retry * try out avast * vendor * try and make linters happy * fix wrap check
1 parent fa8f665 commit 27ac431

File tree

97 files changed

+5322
-101
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

97 files changed

+5322
-101
lines changed

cns/service/main.go

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import (
4444
"github.com/Azure/azure-container-networking/platform"
4545
localtls "github.com/Azure/azure-container-networking/server/tls"
4646
"github.com/Azure/azure-container-networking/store"
47+
"github.com/avast/retry-go/v3"
4748
"github.com/pkg/errors"
4849
"k8s.io/apimachinery/pkg/types"
4950
ctrl "sigs.k8s.io/controller-runtime"
@@ -60,6 +61,7 @@ const (
6061

6162
// 720 * acn.FiveSeconds sec sleeps = 1Hr
6263
maxRetryNodeRegister = 720
64+
initCNSInitalDelay = 10 * time.Second
6365
)
6466

6567
var (
@@ -308,57 +310,51 @@ func registerNode(httpc *http.Client, httpRestService cns.HTTPService, dncEP, in
308310
nodeRegisterRequest.NmAgentSupportedApis = supportedApis
309311

310312
// CNS tries to register the Node for a maximum of an hour.
311-
for tryNum := 0; tryNum <= maxRetryNodeRegister; tryNum++ {
312-
success, err := sendRegisterNodeRequest(httpc, httpRestService, nodeRegisterRequest, url)
313-
if err != nil {
314-
return err
315-
}
316-
if success {
317-
return nil
318-
}
319-
time.Sleep(acn.FiveSeconds)
320-
}
321-
return fmt.Errorf("[Azure CNS] Failed to register node %s after maximum reties for an hour with Infrastructure Network: %s PrivateEndpoint: %s",
322-
nodeID, infraVnet, dncEP)
313+
err := retry.Do(func() error {
314+
return sendRegisterNodeRequest(httpc, httpRestService, nodeRegisterRequest, url)
315+
}, retry.Delay(acn.FiveSeconds), retry.Attempts(maxRetryNodeRegister), retry.DelayType(retry.FixedDelay))
316+
317+
return errors.Wrap(err, fmt.Sprintf("[Azure CNS] Failed to register node %s after maximum reties for an hour with Infrastructure Network: %s PrivateEndpoint: %s",
318+
nodeID, infraVnet, dncEP))
323319
}
324320

325321
// sendRegisterNodeRequest makes a single attempt to register the node and returns an error if that attempt fails, so the caller can decide whether to retry.
326322
func sendRegisterNodeRequest(
327323
httpc *http.Client,
328324
httpRestService cns.HTTPService,
329325
nodeRegisterRequest cns.NodeRegisterRequest,
330-
registerURL string) (bool, error) {
326+
registerURL string) error {
331327

332328
var body bytes.Buffer
333329
err := json.NewEncoder(&body).Encode(nodeRegisterRequest)
334330
if err != nil {
335331
log.Errorf("[Azure CNS] Failed to register node while encoding json failed with non-retriable err %v", err)
336-
return false, err
332+
return errors.Wrap(retry.Unrecoverable(err), "failed to sendRegisterNodeRequest")
337333
}
338334

339335
response, err := httpc.Post(registerURL, "application/json", &body)
340336
if err != nil {
341337
logger.Errorf("[Azure CNS] Failed to register node with retriable err: %+v", err)
342-
return false, nil
338+
return errors.Wrap(err, "failed to sendRegisterNodeRequest")
343339
}
344340
defer response.Body.Close()
345341

346342
if response.StatusCode != http.StatusCreated {
347343
err = fmt.Errorf("[Azure CNS] Failed to register node, DNC replied with http status code %s", strconv.Itoa(response.StatusCode))
348344
logger.Errorf(err.Error())
349-
return false, nil
345+
return errors.Wrap(err, "failed to sendRegisterNodeRequest")
350346
}
351347

352348
var req cns.SetOrchestratorTypeRequest
353349
err = json.NewDecoder(response.Body).Decode(&req)
354350
if err != nil {
355351
log.Errorf("[Azure CNS] decoding Node Resgister response json failed with err %v", err)
356-
return false, nil
352+
return errors.Wrap(err, "failed to sendRegisterNodeRequest")
357353
}
358354
httpRestService.SetNodeOrchestrator(&req)
359355

360356
logger.Printf("[Azure CNS] Node Registered")
361-
return true, nil
357+
return nil
362358
}
363359

364360
// Main is the entry point for CNS.
@@ -884,9 +880,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
884880
}
885881
}()
886882

887-
err = initCNS(ctx, scopedcli, httpRestServiceImplementation)
888-
if err != nil {
883+
// apiserver nnc might not be registered or api server might be down and crashloop backoff puts us outside of 5-10 minutes we have for
884+
// aks addons to come up so retry a bit more aggressively here.
885+
// will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
886+
err = retry.Do(func() error {
887+
err = initCNS(ctx, scopedcli, httpRestServiceImplementation)
888+
if err != nil {
889+
logger.Errorf("[Azure CNS] Failed to init cns with err: %v", err)
890+
}
889891
return errors.Wrap(err, "failed to initialize CNS state")
892+
}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute))
893+
if err != nil {
894+
return err
890895
}
891896

892897
manager, err := ctrl.NewManager(kubeConfig, ctrl.Options{

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ require (
4848
)
4949

5050
require (
51+
github.com/avast/retry-go v3.0.0+incompatible
52+
github.com/avast/retry-go/v3 v3.1.1
5153
github.com/beorn7/perks v1.0.1 // indirect
5254
github.com/cespare/xxhash/v2 v2.1.1 // indirect
5355
github.com/containerd/cgroups v1.0.1 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5
7474
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
7575
github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
7676
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
77+
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
78+
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
79+
github.com/avast/retry-go/v3 v3.1.1 h1:49Scxf4v8PmiQ/nY0aY3p0hDueqSmc7++cBbtiDGu2g=
80+
github.com/avast/retry-go/v3 v3.1.1/go.mod h1:6cXRK369RpzFL3UQGqIUp9Q7GDrams+KsYWrfNA1/nQ=
7781
github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM=
7882
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
7983
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=

vendor/github.com/Microsoft/go-winio/go.mod

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/Microsoft/go-winio/go.sum

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/Microsoft/hcsshim/go.mod

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)