Commit d61a128

feat: add metric for NNC init failures (#3453)
Signed-off-by: Evan Baker <[email protected]>
1 parent 188f0bc commit d61a128

File tree

2 files changed: +43, -27 lines
cns/service/main.go

Lines changed: 5 additions & 7 deletions
@@ -1064,7 +1064,7 @@ func main() {
 					return errors.Wrap(err, "failed to start fsnotify watcher, will retry")
 				}
 				return nil
-			}, retry.DelayType(retry.BackOffDelay), retry.Attempts(0), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
+			}, retry.DelayType(retry.BackOffDelay), retry.UntilSucceeded(), retry.Context(rootCtx)) // infinite cancellable exponential backoff retrier
 		}()
 	}

@@ -1453,20 +1453,18 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
 	// aks addons to come up so retry a bit more aggresively here.
 	// will retry 10 times maxing out at a minute taking about 8 minutes before it gives up.
 	attempt := 0
-	err = retry.Do(func() error {
+	_ = retry.Do(func() error {
 		attempt++
 		logger.Printf("reconciling initial CNS state attempt: %d", attempt)
 		err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
 		if err != nil {
 			logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
+			nncInitFailure.Inc()
 		}
 		return errors.Wrap(err, "failed to initialize CNS state")
-	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute))
-	if err != nil {
-		return err
-	}
+	}, retry.Context(ctx), retry.Delay(initCNSInitalDelay), retry.MaxDelay(time.Minute), retry.UntilSucceeded())
 	logger.Printf("reconciled initial CNS state after %d attempts", attempt)
-
+	hasNNCInitialized.Set(1)
 	scheme := kuberuntime.NewScheme()
 	if err := corev1.AddToScheme(scheme); err != nil { //nolint:govet // intentional shadow
 		return errors.Wrap(err, "failed to add corev1 to scheme")
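The hunk above follows a simple pattern: retry the initial NNC reconcile indefinitely, increment a failure counter on every unsuccessful attempt, and set a gauge to 1 once the reconcile finally succeeds. Below is a minimal, self-contained sketch of that pattern, assuming the retry package is github.com/avast/retry-go/v4 (whose options match those used here) and using hypothetical stand-ins (doWork, workFailures, workSucceeded) rather than the actual CNS code.

package main

import (
	"context"
	"errors"
	"log"
	"time"

	retry "github.com/avast/retry-go/v4"
	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical stand-ins for nncInitFailure and hasNNCInitialized.
var (
	workFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "work_init_failures_total",
		Help: "Number of times the initial work attempt failed.",
	})
	workSucceeded = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "work_initialized",
		Help: "Whether the initial work has completed.",
	})
)

// doWork is a placeholder for the real reconcile: it fails twice, then succeeds.
var calls int

func doWork(context.Context) error {
	calls++
	if calls < 3 {
		return errors.New("not ready yet")
	}
	return nil
}

func main() {
	prometheus.MustRegister(workFailures, workSucceeded)

	ctx := context.Background()
	attempt := 0
	// UntilSucceeded() keeps retrying (with the configured delays) until the
	// function returns nil or the context is cancelled, so the returned error
	// can safely be discarded here.
	_ = retry.Do(func() error {
		attempt++
		if err := doWork(ctx); err != nil {
			workFailures.Inc() // one increment per failed attempt
			return err
		}
		return nil
	}, retry.Context(ctx), retry.Delay(100*time.Millisecond), retry.MaxDelay(time.Second), retry.UntilSucceeded())

	workSucceeded.Set(1) // flips to 1 exactly once, after the first success
	log.Printf("initialized after %d attempts", attempt)
}

This also explains why the commit drops the old post-loop error check: with retry.UntilSucceeded(), retry.Do only returns once the reconcile has succeeded (or the context is cancelled), so the error assignment becomes unnecessary.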

cns/service/metrics.go

Lines changed: 38 additions & 20 deletions
@@ -5,31 +5,49 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
-// managerStartFailures is a monotic counter which tracks the number of times the controller-runtime
-// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
-// of increase over a period of time. A positive rate of change indicates that the CNS is actively
-// failing and retrying.
-var managerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_ctrlmanager_start_failures_total",
-		Help: "Number of times the controller-runtime manager failed to start.",
-	},
-)
-
-// nncReconcilerStartFailures is a monotic counter which tracks the number of times the NNC reconciler
-// has failed to start within the timeout period. To drive alerting based on this metric, it is
-// recommended to use the rate of increase over a period of time. A positive rate of change indicates
-// that the CNS is actively failing and retrying.
-var nncReconcilerStartFailures = prometheus.NewCounter(
-	prometheus.CounterOpts{
-		Name: "cns_nnc_reconciler_start_failures_total",
-		Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
-	},
+var (
+	// managerStartFailures is a monotic counter which tracks the number of times the controller-runtime
+	// manager failed to start. To drive alerting based on this metric, it is recommended to use the rate
+	// of increase over a period of time. A positive rate of change indicates that the CNS is actively
+	// failing and retrying.
+	managerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_ctrlmanager_start_failures_total",
+			Help: "Number of times the controller-runtime manager failed to start.",
+		},
+	)
+	// nncReconcilerStartFailures is a monotic counter which tracks the number of times the NNC reconciler
+	// has failed to start within the timeout period. To drive alerting based on this metric, it is
+	// recommended to use the rate of increase over a period of time. A positive rate of change indicates
+	// that the CNS is actively failing and retrying.
+	nncReconcilerStartFailures = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_reconciler_start_failures_total",
+			Help: "Number of times the NNC reconciler has failed to start within the timeout period.",
+		},
+	)
+	// nncInitFailure is a monotic counter which tracks the number of times the initial NNC reconcile
+	// has failed.
+	nncInitFailure = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "cns_nnc_init_failures_total",
+			Help: "Number of times the initial NNC reconcile has failed.",
+		},
+	)
+	// hasNNCInitialized is a gauge which tracks whether the initial NNC reconcile has completed.
+	hasNNCInitialized = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "cns_nnc_initialized",
+			Help: "Whether the initial NNC reconcile has completed.",
+		},
+	)
 )
 
 func init() {
 	metrics.Registry.MustRegister(
 		managerStartFailures,
 		nncReconcilerStartFailures,
+		nncInitFailure,
+		hasNNCInitialized,
 	)
 }
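These metrics are registered against controller-runtime's shared metrics.Registry. For context, that registry is both a prometheus.Registerer (used by MustRegister above) and a prometheus.Gatherer, so it can be served over HTTP with promhttp. The sketch below is illustrative only; the handler path and port are assumptions, not how CNS actually exposes its metrics.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func main() {
	// Serve everything registered on the controller-runtime registry,
	// including the counters and gauge added in this commit.
	http.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))

	// Illustrative port; the real CNS wires metrics into its own server setup.
	log.Fatal(http.ListenAndServe(":9090", nil))
}

Once scraped, cns_nnc_init_failures_total can be alerted on via its rate of increase, as the comments recommend for the other failure counters, while cns_nnc_initialized gives a simple 0/1 signal that the initial reconcile has completed.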
