Commit d61a128

feat: add metric for NNC init failures (#3453)
Signed-off-by: Evan Baker <[email protected]>
1 parent 188f0bc commit d61a128

File tree

2 files changed: +43, -27 lines
cns/service/main.go

Lines changed: 5 additions & 7 deletions
@@ -1064,7 +1064,7 @@ func main() {
 					return errors.Wrap(err, "failed to start fsnotify watcher, will retry")
 				}
 				return nil
-			}, retry.DelayType(retry.BackOffDelay), retry.Attempts(0), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
+			}, retry.DelayType(retry.BackOffDelay), retry.UntilSucceeded(), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
 		}()
 	}

@@ -1453,20 +1453,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
 	// aks addons to come up so retry a bit more aggresively here.
 	// will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
 	attempt := 0
-	err = retry.Do(func() error {
+	_ = retry.Do(func() error {
 		attempt++
 		logger.Printf("reconciling initial CNS state attempt: %d", attempt)
 		err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
 		if err != nil {
 			logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
+			nncInitFailure.Inc()
 		}
 		return errors.Wrap(err, "failed to initialize CNS state")
-	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute))
-	if err != nil {
-		return err
-	}
+	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute), retry.UntilSucceeded())
 	logger.Printf("reconciled initial CNS state after %d attempts", attempt)
-
+	hasNNCInitialized.Set(1)
 	scheme := kuberuntime.NewScheme()
 	if err := corev1.AddToScheme(scheme); err != nil { //nolint:govet // intentional shadow
 		return errors.Wrap(err, "failed to add corev1 to scheme")
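The hunk above follows a simple pattern: retry the initial NNC reconcile indefinitely, increment a failure counter on every unsuccessful attempt, and set a gauge to 1 once the reconcile finally succeeds. Below is a minimal, self-contained sketch of that pattern, assuming the retry package is github.com/avast/retry-go/v4 (whose options match those used here) and using hypothetical stand-ins (doWork, workFailures, workSucceeded) rather than the actual CNS code.

package main

import (
	"context"
	"errors"
	"log"
	"time"

	retry "github.com/avast/retry-go/v4"
	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical stand-ins for nncInitFailure and hasNNCInitialized.
var (
	workFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "work_init_failures_total",
		Help: "Number of times the initial work attempt failed.",
	})
	workSucceeded = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "work_initialized",
		Help: "Whether the initial work has completed.",
	})
)

// doWork is a placeholder for the real reconcile: it fails twice, then succeeds.
var calls int

func doWork(context.Context) error {
	calls++
	if calls < 3 {
		return errors.New("not ready yet")
	}
	return nil
}

func main() {
	prometheus.MustRegister(workFailures, workSucceeded)

	ctx := context.Background()
	attempt := 0
	// UntilSucceeded() keeps retrying (with the configured delays) until the
	// function returns nil or the context is cancelled, so the returned error
	// can safely be discarded here.
	_ = retry.Do(func() error {
		attempt++
		if err := doWork(ctx); err != nil {
			workFailures.Inc() // one increment per failed attempt
			return err
		}
		return nil
	}, retry.Context(ctx), retry.Delay(100*time.Millisecond), retry.MaxDelay(time.Second), retry.UntilSucceeded())

	workSucceeded.Set(1) // flips to 1 exactly once, after the first success
	log.Printf("initialized after %d attempts", attempt)
}

This also explains why the commit drops the old post-loop error check: with retry.UntilSucceeded(), retry.Do only returns once the reconcile has succeeded (or the context is cancelled), so the error assignment becomes unnecessary.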

cns/service/metrics.go

Lines changed: 38 additions & 20 deletions
@@ -5,31 +5,49 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
-// managerStartFailures is a monotic counter which tracks the number of times the controller-runtime
-// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
-// of increase over a period of time. A positive rate of change indicates that the CNS is actively
-// failing and retrying.
-var managerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_ctrlmanager_start_failures_total",
-		Help: "Number of times the controller-runtime manager failed to start.",
-	},
-)
-
-// nncReconcilerStartFailures is a monotic counter which tracks the number of times the NNC reconciler
-// has failed to start within the timeout period. To drive alerting based on this metric, it is
-// recommended to use the rate of increase over a period of time. A positive rate of change indicates
-// that the CNS is actively failing and retrying.
-var nncReconcilerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_nnc_reconciler_start_failures_total",
-		Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
-	},
+var (
+	// managerStartFailures is a monotic counter which tracks the number of times the controller-runtime
+	// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
+	// of increase over a period of time. A positive rate of change indicates that the CNS is actively
+	// failing and retrying.
+	managerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_ctrlmanager_start_failures_total",
+			Help: "Number of times the controller-runtime manager failed to start.",
+		},
+	)
+	// nncReconcilerStartFailures is a monotic counter which tracks the number of times the NNC reconciler
+	// has failed to start within the timeout period. To drive alerting based on this metric, it is
+	// recommended to use the rate of increase over a period of time. A positive rate of change indicates
+	// that the CNS is actively failing and retrying.
+	nncReconcilerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_reconciler_start_failures_total",
+			Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
+		},
+	)
+	// nncInitFailure is a monotic counter which tracks the number of times the initial NNC reconcile
+	// has failed.
+	nncInitFailure = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_init_failures_total",
+			Help: "Number of times the initial NNC reconcile has failed.",
+		},
+	)
+	// hasNNCInitialized is a gauge which tracks whether the initial NNC reconcile has completed.
+	hasNNCInitialized = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "cns_nnc_initialized",
+			Help: "Whether the initial NNC reconcile has completed.",
+		},
+	)
 )
 
 func init() {
 	metrics.Registry.MustRegister(
 		managerStartFailures,
 		nncReconcilerStartFailures,
+		nncInitFailure,
+		hasNNCInitialized,
 	)
 }
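These metrics are registered against controller-runtime's shared metrics.Registry. For context, that registry is both a prometheus.Registerer (used by MustRegister above) and a prometheus.Gatherer, so it can be served over HTTP with promhttp. The sketch below is illustrative only; the handler path and port are assumptions, not how CNS actually exposes its metrics.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func main() {
	// Serve everything registered on the controller-runtime registry,
	// including the counters and gauge added in this commit.
	http.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))

	// Illustrative port; the real CNS wires metrics into its own server setup.
	log.Fatal(http.ListenAndServe(":9090", nil))
}

Once scraped, cns_nnc_init_failures_total can be alerted on via its rate of increase, as the comments recommend for the other failure counters, while cns_nnc_initialized gives a simple 0/1 signal that the initial reconcile has completed.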
