Skip to content

Commit 7dc491f

Browse files
mattmattoxclaude
and committed
fix(prometheus): add condition_status gauge to fix non-resolving alerts (Issue #18)
All 11 condition-based PrometheusRule alerts used conditions_total (a counter) with > 0, meaning they could never self-resolve once fired. - Add condition_status GaugeVec (1=True, 0=False) for current state - Set gauge in ExportStatus() alongside existing counter increment - Update all 11 alert expressions to use gauge with correct polarity - Keep conditions_total counter for historical tracking - Add integration test verifying gauge state transitions Closes #18 Task #9412 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 8beb6c3 commit 7dc491f

File tree

5 files changed

+143
-11
lines changed

5 files changed

+143
-11
lines changed

helm/node-doctor/templates/prometheusrule.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ spec:
2121
rules:
2222
- alert: NodeDoctorCNIConfigInvalid
2323
expr: |
24-
node_doctor_monitor_conditions_total{condition_type="CNIConfigValid", status="False"} > 0
24+
node_doctor_monitor_condition_status{condition_type="CNIConfigValid"} == 0
2525
for: {{ .Values.prometheusRule.critical.cniConfigInvalid.for }}
2626
labels:
2727
severity: critical
@@ -38,7 +38,7 @@ spec:
3838

3939
- alert: NodeDoctorCNIUnhealthy
4040
expr: |
41-
node_doctor_monitor_conditions_total{condition_type="CNIHealthy", status="False"} > 0
41+
node_doctor_monitor_condition_status{condition_type="CNIHealthy"} == 0
4242
for: {{ .Values.prometheusRule.critical.cniUnhealthy.for }}
4343
labels:
4444
severity: critical
@@ -55,7 +55,7 @@ spec:
5555

5656
- alert: NodeDoctorNetworkPartitioned
5757
expr: |
58-
node_doctor_monitor_conditions_total{condition_type="NetworkPartitioned", status="True"} > 0
58+
node_doctor_monitor_condition_status{condition_type="NetworkPartitioned"} == 1
5959
for: {{ .Values.prometheusRule.critical.networkPartitioned.for }}
6060
labels:
6161
severity: critical
@@ -72,7 +72,7 @@ spec:
7272

7373
- alert: NodeDoctorKubeletUnhealthy
7474
expr: |
75-
node_doctor_monitor_conditions_total{condition_type="KubeletHealthy", status="False"} > 0
75+
node_doctor_monitor_condition_status{condition_type="KubeletHealthy"} == 0
7676
for: {{ .Values.prometheusRule.critical.kubeletUnhealthy.for }}
7777
labels:
7878
severity: critical
@@ -89,7 +89,7 @@ spec:
8989

9090
- alert: NodeDoctorReadOnlyFilesystem
9191
expr: |
92-
node_doctor_monitor_conditions_total{condition_type="ReadOnlyFilesystem", status="True"} > 0
92+
node_doctor_monitor_condition_status{condition_type="ReadOnlyFilesystem"} == 1
9393
for: {{ .Values.prometheusRule.critical.readOnlyFilesystem.for }}
9494
labels:
9595
severity: critical
@@ -111,7 +111,7 @@ spec:
111111
rules:
112112
- alert: NodeDoctorDNSResolutionFailed
113113
expr: |
114-
node_doctor_monitor_conditions_total{condition_type="DNSResolutionFailed", status="True"} > 0
114+
node_doctor_monitor_condition_status{condition_type="DNSResolutionFailed"} == 1
115115
for: {{ .Values.prometheusRule.warning.dnsResolutionFailed.for }}
116116
labels:
117117
severity: warning
@@ -122,7 +122,7 @@ spec:
122122

123123
- alert: NodeDoctorDNSLatencyHigh
124124
expr: |
125-
node_doctor_monitor_conditions_total{condition_type="DNSLatencyHigh", status="True"} > 0
125+
node_doctor_monitor_condition_status{condition_type="DNSLatencyHigh"} == 1
126126
for: {{ .Values.prometheusRule.warning.dnsLatencyHigh.for }}
127127
labels:
128128
severity: warning
@@ -133,7 +133,7 @@ spec:
133133

134134
- alert: NodeDoctorMemoryPressure
135135
expr: |
136-
node_doctor_monitor_conditions_total{condition_type="MemoryPressure", status="True"} > 0
136+
node_doctor_monitor_condition_status{condition_type="MemoryPressure"} == 1
137137
for: {{ .Values.prometheusRule.warning.memoryPressure.for }}
138138
labels:
139139
severity: warning
@@ -144,7 +144,7 @@ spec:
144144

145145
- alert: NodeDoctorDiskPressure
146146
expr: |
147-
node_doctor_monitor_conditions_total{condition_type="DiskPressure", status="True"} > 0
147+
node_doctor_monitor_condition_status{condition_type="DiskPressure"} == 1
148148
for: {{ .Values.prometheusRule.warning.diskPressure.for }}
149149
labels:
150150
severity: warning
@@ -155,7 +155,7 @@ spec:
155155

156156
- alert: NodeDoctorNetworkDegraded
157157
expr: |
158-
node_doctor_monitor_conditions_total{condition_type="NetworkDegraded", status="True"} > 0
158+
node_doctor_monitor_condition_status{condition_type="NetworkDegraded"} == 1
159159
for: {{ .Values.prometheusRule.warning.networkDegraded.for }}
160160
labels:
161161
severity: warning
@@ -188,7 +188,7 @@ spec:
188188

189189
- alert: NodeDoctorAPIServerLatencyHigh
190190
expr: |
191-
node_doctor_monitor_conditions_total{condition_type="APIServerLatencyHigh", status="True"} > 0
191+
node_doctor_monitor_condition_status{condition_type="APIServerLatencyHigh"} == 1
192192
for: {{ .Values.prometheusRule.warning.apiServerLatencyHigh.for }}
193193
labels:
194194
severity: warning

pkg/exporters/prometheus/exporter.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
179179
for _, condition := range status.Conditions {
180180
e.metrics.ConditionsTotal.WithLabelValues(
181181
e.nodeName, condition.Type, string(condition.Status)).Inc()
182+
183+
// Set gauge: 1 when condition is True, 0 when False/Unknown
184+
val := 0.0
185+
if condition.Status == types.ConditionTrue {
186+
val = 1.0
187+
}
188+
e.metrics.ConditionStatus.WithLabelValues(
189+
e.nodeName, condition.Type).Set(val)
182190
}
183191

184192
// Update events

pkg/exporters/prometheus/integration_test.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ func TestPrometheusFormat(t *testing.T) {
245245
expectedCustomMetrics := []string{
246246
"test_problems_total",
247247
"test_status_updates_total",
248+
"test_condition_status",
248249
"test_info",
249250
}
250251

@@ -255,6 +256,109 @@ func TestPrometheusFormat(t *testing.T) {
255256
}
256257
}
257258

259+
func TestConditionStatusGauge(t *testing.T) {
260+
config := &types.PrometheusExporterConfig{
261+
Enabled: true,
262+
Port: 9112,
263+
Path: "/metrics",
264+
Namespace: "test",
265+
}
266+
settings := &types.GlobalSettings{NodeName: "test-node"}
267+
268+
exporter, err := NewPrometheusExporter(config, settings)
269+
if err != nil {
270+
t.Fatalf("failed to create exporter: %v", err)
271+
}
272+
273+
ctx := context.Background()
274+
err = exporter.Start(ctx)
275+
if err != nil {
276+
t.Fatalf("failed to start exporter: %v", err)
277+
}
278+
defer exporter.Stop()
279+
280+
// Export status with condition True
281+
status := &types.Status{
282+
Source: "test",
283+
Timestamp: time.Now(),
284+
Conditions: []types.Condition{
285+
{
286+
Type: "NetworkPartitioned",
287+
Status: types.ConditionTrue,
288+
Reason: "PeerUnreachable",
289+
Message: "Cannot reach peers",
290+
Transition: time.Now(),
291+
},
292+
},
293+
}
294+
exporter.ExportStatus(ctx, status)
295+
296+
time.Sleep(100 * time.Millisecond)
297+
298+
// Scrape and verify gauge == 1
299+
body := scrapeMetrics(t, config.Port, config.Path)
300+
if !strings.Contains(body, `test_condition_status{condition_type="NetworkPartitioned"`) {
301+
t.Error("condition_status metric not found for NetworkPartitioned")
302+
}
303+
if !containsMetricWithValue(body, "test_condition_status", "NetworkPartitioned", "1") {
304+
t.Error("expected condition_status=1 for NetworkPartitioned=True")
305+
}
306+
307+
// Export status with condition False — gauge should update to 0
308+
status2 := &types.Status{
309+
Source: "test",
310+
Timestamp: time.Now(),
311+
Conditions: []types.Condition{
312+
{
313+
Type: "NetworkPartitioned",
314+
Status: types.ConditionFalse,
315+
Reason: "PeersReachable",
316+
Message: "All peers reachable",
317+
Transition: time.Now(),
318+
},
319+
},
320+
}
321+
exporter.ExportStatus(ctx, status2)
322+
323+
time.Sleep(100 * time.Millisecond)
324+
325+
// Scrape and verify gauge == 0
326+
body = scrapeMetrics(t, config.Port, config.Path)
327+
if !containsMetricWithValue(body, "test_condition_status", "NetworkPartitioned", "0") {
328+
t.Error("expected condition_status=0 for NetworkPartitioned=False, but gauge did not update")
329+
}
330+
}
331+
332+
// scrapeMetrics fetches the /metrics endpoint and returns the body as a string.
333+
func scrapeMetrics(t *testing.T, port int, path string) string {
334+
t.Helper()
335+
resp, err := http.Get(fmt.Sprintf("http://localhost:%d%s", port, path))
336+
if err != nil {
337+
t.Fatalf("failed to get metrics: %v", err)
338+
}
339+
defer resp.Body.Close()
340+
body, err := io.ReadAll(resp.Body)
341+
if err != nil {
342+
t.Fatalf("failed to read response body: %v", err)
343+
}
344+
return string(body)
345+
}
346+
347+
// containsMetricWithValue checks if a Prometheus text exposition contains a metric
348+
// with the given name, condition_type label value, and numeric value.
349+
func containsMetricWithValue(body, metricName, conditionType, value string) bool {
350+
scanner := bufio.NewScanner(strings.NewReader(body))
351+
for scanner.Scan() {
352+
line := scanner.Text()
353+
if strings.HasPrefix(line, metricName) &&
354+
strings.Contains(line, fmt.Sprintf(`condition_type="%s"`, conditionType)) &&
355+
strings.HasSuffix(strings.TrimSpace(line), " "+value) {
356+
return true
357+
}
358+
}
359+
return false
360+
}
361+
258362
func TestConcurrentScrapes(t *testing.T) {
259363
config := &types.PrometheusExporterConfig{
260364
Enabled: true,

pkg/exporters/prometheus/metrics.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type Metrics struct {
1919
// Gauge metrics
2020
ProblemsActive *prometheus.GaugeVec
2121
MonitorUp *prometheus.GaugeVec
22+
ConditionStatus *prometheus.GaugeVec
2223
Info *prometheus.GaugeVec
2324
StartTimeSeconds *prometheus.GaugeVec
2425
UptimeSeconds *prometheus.GaugeVec
@@ -126,6 +127,17 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
126127
),
127128

128129
// Gauge metrics
130+
ConditionStatus: prometheus.NewGaugeVec(
131+
prometheus.GaugeOpts{
132+
Namespace: namespace,
133+
Subsystem: subsystem,
134+
Name: "condition_status",
135+
Help: "Current status of node conditions (1=active/True, 0=inactive/False)",
136+
ConstLabels: labels,
137+
},
138+
[]string{"node", "condition_type"},
139+
),
140+
129141
ProblemsActive: prometheus.NewGaugeVec(
130142
prometheus.GaugeOpts{
131143
Namespace: namespace,
@@ -358,6 +370,7 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
358370
m.ExportErrorsTotal,
359371
m.ProblemsActive,
360372
m.MonitorUp,
373+
m.ConditionStatus,
361374
m.Info,
362375
m.StartTimeSeconds,
363376
m.UptimeSeconds,
@@ -398,6 +411,7 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
398411
m.ExportErrorsTotal,
399412
m.ProblemsActive,
400413
m.MonitorUp,
414+
m.ConditionStatus,
401415
m.Info,
402416
m.StartTimeSeconds,
403417
m.UptimeSeconds,

pkg/exporters/prometheus/metrics_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ func TestNewMetrics(t *testing.T) {
8484
if metrics.MonitorUp == nil {
8585
t.Error("MonitorUp metric not created")
8686
}
87+
if metrics.ConditionStatus == nil {
88+
t.Error("ConditionStatus metric not created")
89+
}
8790
if metrics.Info == nil {
8891
t.Error("Info metric not created")
8992
}
@@ -175,6 +178,8 @@ func TestMetricUpdates(t *testing.T) {
175178
// Test gauge metrics
176179
metrics.ProblemsActive.WithLabelValues("test-node", "DiskPressure", "warning").Set(5)
177180
metrics.MonitorUp.WithLabelValues("test-node", "disk-monitor", "disk").Set(1)
181+
metrics.ConditionStatus.WithLabelValues("test-node", "NetworkPartitioned").Set(1)
182+
metrics.ConditionStatus.WithLabelValues("test-node", "CNIHealthy").Set(0)
178183
metrics.Info.WithLabelValues("test-node", "1.0.0", "abc123", "go1.21", "2023-01-01").Set(1)
179184
metrics.StartTimeSeconds.WithLabelValues("test-node").Set(1640995200)
180185
metrics.UptimeSeconds.WithLabelValues("test-node").Set(3600)
@@ -211,6 +216,7 @@ func TestMetricUpdates(t *testing.T) {
211216
"test_export_errors_total",
212217
"test_problems_active",
213218
"test_monitor_up",
219+
"test_condition_status",
214220
"test_info",
215221
"test_start_time_seconds",
216222
"test_uptime_seconds",

0 commit comments

Comments (0)