Skip to content

Commit 7dc491f

Browse files
mattmattoxclaude
and committed
fix(prometheus): add condition_status gauge to fix non-resolving alerts (Issue #18)
All 11 condition-based PrometheusRule alerts used conditions_total (a counter) with > 0, meaning they could never self-resolve once fired. - Add condition_status GaugeVec (1=True, 0=False) for current state - Set gauge in ExportStatus() alongside existing counter increment - Update all 11 alert expressions to use gauge with correct polarity - Keep conditions_total counter for historical tracking - Add integration test verifying gauge state transitions Closes #18 Task #9412 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 8beb6c3 commit 7dc491f

File tree

5 files changed

+143
-11
lines changed

5 files changed

+143
-11
lines changed

helm/node-doctor/templates/prometheusrule.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ spec:
2121
rules:
2222
- alert: NodeDoctorCNIConfigInvalid
2323
expr: |
24-
node_doctor_monitor_conditions_total{condition_type="CNIConfigValid", status="False"} > 0
24+
node_doctor_monitor_condition_status{condition_type="CNIConfigValid"} == 0
2525
for: {{ .Values.prometheusRule.critical.cniConfigInvalid.for }}
2626
labels:
2727
severity: critical
@@ -38,7 +38,7 @@ spec:
3838

3939
- alert: NodeDoctorCNIUnhealthy
4040
expr: |
41-
node_doctor_monitor_conditions_total{condition_type="CNIHealthy", status="False"} > 0
41+
node_doctor_monitor_condition_status{condition_type="CNIHealthy"} == 0
4242
for: {{ .Values.prometheusRule.critical.cniUnhealthy.for }}
4343
labels:
4444
severity: critical
@@ -55,7 +55,7 @@ spec:
5555

5656
- alert: NodeDoctorNetworkPartitioned
5757
expr: |
58-
node_doctor_monitor_conditions_total{condition_type="NetworkPartitioned", status="True"} > 0
58+
node_doctor_monitor_condition_status{condition_type="NetworkPartitioned"} == 1
5959
for: {{ .Values.prometheusRule.critical.networkPartitioned.for }}
6060
labels:
6161
severity: critical
@@ -72,7 +72,7 @@ spec:
7272

7373
- alert: NodeDoctorKubeletUnhealthy
7474
expr: |
75-
node_doctor_monitor_conditions_total{condition_type="KubeletHealthy", status="False"} > 0
75+
node_doctor_monitor_condition_status{condition_type="KubeletHealthy"} == 0
7676
for: {{ .Values.prometheusRule.critical.kubeletUnhealthy.for }}
7777
labels:
7878
severity: critical
@@ -89,7 +89,7 @@ spec:
8989

9090
- alert: NodeDoctorReadOnlyFilesystem
9191
expr: |
92-
node_doctor_monitor_conditions_total{condition_type="ReadOnlyFilesystem", status="True"} > 0
92+
node_doctor_monitor_condition_status{condition_type="ReadOnlyFilesystem"} == 1
9393
for: {{ .Values.prometheusRule.critical.readOnlyFilesystem.for }}
9494
labels:
9595
severity: critical
@@ -111,7 +111,7 @@ spec:
111111
rules:
112112
- alert: NodeDoctorDNSResolutionFailed
113113
expr: |
114-
node_doctor_monitor_conditions_total{condition_type="DNSResolutionFailed", status="True"} > 0
114+
node_doctor_monitor_condition_status{condition_type="DNSResolutionFailed"} == 1
115115
for: {{ .Values.prometheusRule.warning.dnsResolutionFailed.for }}
116116
labels:
117117
severity: warning
@@ -122,7 +122,7 @@ spec:
122122

123123
- alert: NodeDoctorDNSLatencyHigh
124124
expr: |
125-
node_doctor_monitor_conditions_total{condition_type="DNSLatencyHigh", status="True"} > 0
125+
node_doctor_monitor_condition_status{condition_type="DNSLatencyHigh"} == 1
126126
for: {{ .Values.prometheusRule.warning.dnsLatencyHigh.for }}
127127
labels:
128128
severity: warning
@@ -133,7 +133,7 @@ spec:
133133

134134
- alert: NodeDoctorMemoryPressure
135135
expr: |
136-
node_doctor_monitor_conditions_total{condition_type="MemoryPressure", status="True"} > 0
136+
node_doctor_monitor_condition_status{condition_type="MemoryPressure"} == 1
137137
for: {{ .Values.prometheusRule.warning.memoryPressure.for }}
138138
labels:
139139
severity: warning
@@ -144,7 +144,7 @@ spec:
144144

145145
- alert: NodeDoctorDiskPressure
146146
expr: |
147-
node_doctor_monitor_conditions_total{condition_type="DiskPressure", status="True"} > 0
147+
node_doctor_monitor_condition_status{condition_type="DiskPressure"} == 1
148148
for: {{ .Values.prometheusRule.warning.diskPressure.for }}
149149
labels:
150150
severity: warning
@@ -155,7 +155,7 @@ spec:
155155

156156
- alert: NodeDoctorNetworkDegraded
157157
expr: |
158-
node_doctor_monitor_conditions_total{condition_type="NetworkDegraded", status="True"} > 0
158+
node_doctor_monitor_condition_status{condition_type="NetworkDegraded"} == 1
159159
for: {{ .Values.prometheusRule.warning.networkDegraded.for }}
160160
labels:
161161
severity: warning
@@ -188,7 +188,7 @@ spec:
188188

189189
- alert: NodeDoctorAPIServerLatencyHigh
190190
expr: |
191-
node_doctor_monitor_conditions_total{condition_type="APIServerLatencyHigh", status="True"} > 0
191+
node_doctor_monitor_condition_status{condition_type="APIServerLatencyHigh"} == 1
192192
for: {{ .Values.prometheusRule.warning.apiServerLatencyHigh.for }}
193193
labels:
194194
severity: warning

pkg/exporters/prometheus/exporter.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
179179
for _, condition := range status.Conditions {
180180
e.metrics.ConditionsTotal.WithLabelValues(
181181
e.nodeName, condition.Type, string(condition.Status)).Inc()
182+
183+
// Set gauge: 1 when condition is True, 0 when False/Unknown
184+
val := 0.0
185+
if condition.Status == types.ConditionTrue {
186+
val = 1.0
187+
}
188+
e.metrics.ConditionStatus.WithLabelValues(
189+
e.nodeName, condition.Type).Set(val)
182190
}
183191

184192
// Update events

pkg/exporters/prometheus/integration_test.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ func TestPrometheusFormat(t *testing.T) {
245245
expectedCustomMetrics := []string{
246246
"test_problems_total",
247247
"test_status_updates_total",
248+
"test_condition_status",
248249
"test_info",
249250
}
250251

@@ -255,6 +256,109 @@ func TestPrometheusFormat(t *testing.T) {
255256
}
256257
}
257258

259+
func TestConditionStatusGauge(t *testing.T) {
260+
config := &types.PrometheusExporterConfig{
261+
Enabled: true,
262+
Port: 9112,
263+
Path: "/metrics",
264+
Namespace: "test",
265+
}
266+
settings := &types.GlobalSettings{NodeName: "test-node"}
267+
268+
exporter, err := NewPrometheusExporter(config, settings)
269+
if err != nil {
270+
t.Fatalf("failed to create exporter: %v", err)
271+
}
272+
273+
ctx := context.Background()
274+
err = exporter.Start(ctx)
275+
if err != nil {
276+
t.Fatalf("failed to start exporter: %v", err)
277+
}
278+
defer exporter.Stop()
279+
280+
// Export status with condition True
281+
status := &types.Status{
282+
Source: "test",
283+
Timestamp: time.Now(),
284+
Conditions: []types.Condition{
285+
{
286+
Type: "NetworkPartitioned",
287+
Status: types.ConditionTrue,
288+
Reason: "PeerUnreachable",
289+
Message: "Cannot reach peers",
290+
Transition: time.Now(),
291+
},
292+
},
293+
}
294+
exporter.ExportStatus(ctx, status)
295+
296+
time.Sleep(100 * time.Millisecond)
297+
298+
// Scrape and verify gauge == 1
299+
body := scrapeMetrics(t, config.Port, config.Path)
300+
if !strings.Contains(body, `test_condition_status{condition_type="NetworkPartitioned"`) {
301+
t.Error("condition_status metric not found for NetworkPartitioned")
302+
}
303+
if !containsMetricWithValue(body, "test_condition_status", "NetworkPartitioned", "1") {
304+
t.Error("expected condition_status=1 for NetworkPartitioned=True")
305+
}
306+
307+
// Export status with condition False — gauge should update to 0
308+
status2 := &types.Status{
309+
Source: "test",
310+
Timestamp: time.Now(),
311+
Conditions: []types.Condition{
312+
{
313+
Type: "NetworkPartitioned",
314+
Status: types.ConditionFalse,
315+
Reason: "PeersReachable",
316+
Message: "All peers reachable",
317+
Transition: time.Now(),
318+
},
319+
},
320+
}
321+
exporter.ExportStatus(ctx, status2)
322+
323+
time.Sleep(100 * time.Millisecond)
324+
325+
// Scrape and verify gauge == 0
326+
body = scrapeMetrics(t, config.Port, config.Path)
327+
if !containsMetricWithValue(body, "test_condition_status", "NetworkPartitioned", "0") {
328+
t.Error("expected condition_status=0 for NetworkPartitioned=False, but gauge did not update")
329+
}
330+
}
331+
332+
// scrapeMetrics fetches the /metrics endpoint and returns the body as a string.
333+
func scrapeMetrics(t *testing.T, port int, path string) string {
334+
t.Helper()
335+
resp, err := http.Get(fmt.Sprintf("http://localhost:%d%s", port, path))
336+
if err != nil {
337+
t.Fatalf("failed to get metrics: %v", err)
338+
}
339+
defer resp.Body.Close()
340+
body, err := io.ReadAll(resp.Body)
341+
if err != nil {
342+
t.Fatalf("failed to read response body: %v", err)
343+
}
344+
return string(body)
345+
}
346+
347+
// containsMetricWithValue checks if a Prometheus text exposition contains a metric
348+
// with the given name, condition_type label value, and numeric value.
349+
func containsMetricWithValue(body, metricName, conditionType, value string) bool {
350+
scanner := bufio.NewScanner(strings.NewReader(body))
351+
for scanner.Scan() {
352+
line := scanner.Text()
353+
if strings.HasPrefix(line, metricName) &&
354+
strings.Contains(line, fmt.Sprintf(`condition_type="%s"`, conditionType)) &&
355+
strings.HasSuffix(strings.TrimSpace(line), " "+value) {
356+
return true
357+
}
358+
}
359+
return false
360+
}
361+
258362
func TestConcurrentScrapes(t *testing.T) {
259363
config := &types.PrometheusExporterConfig{
260364
Enabled: true,

pkg/exporters/prometheus/metrics.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ type Metrics struct {
1919
// Gauge metrics
2020
ProblemsActive *prometheus.GaugeVec
2121
MonitorUp *prometheus.GaugeVec
22+
ConditionStatus *prometheus.GaugeVec
2223
Info *prometheus.GaugeVec
2324
StartTimeSeconds *prometheus.GaugeVec
2425
UptimeSeconds *prometheus.GaugeVec
@@ -126,6 +127,17 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
126127
),
127128

128129
// Gauge metrics
130+
ConditionStatus: prometheus.NewGaugeVec(
131+
prometheus.GaugeOpts{
132+
Namespace: namespace,
133+
Subsystem: subsystem,
134+
Name: "condition_status",
135+
Help: "Current status of node conditions (1=active/True, 0=inactive/False)",
136+
ConstLabels: labels,
137+
},
138+
[]string{"node", "condition_type"},
139+
),
140+
129141
ProblemsActive: prometheus.NewGaugeVec(
130142
prometheus.GaugeOpts{
131143
Namespace: namespace,
@@ -358,6 +370,7 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
358370
m.ExportErrorsTotal,
359371
m.ProblemsActive,
360372
m.MonitorUp,
373+
m.ConditionStatus,
361374
m.Info,
362375
m.StartTimeSeconds,
363376
m.UptimeSeconds,
@@ -398,6 +411,7 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
398411
m.ExportErrorsTotal,
399412
m.ProblemsActive,
400413
m.MonitorUp,
414+
m.ConditionStatus,
401415
m.Info,
402416
m.StartTimeSeconds,
403417
m.UptimeSeconds,

pkg/exporters/prometheus/metrics_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ func TestNewMetrics(t *testing.T) {
8484
if metrics.MonitorUp == nil {
8585
t.Error("MonitorUp metric not created")
8686
}
87+
if metrics.ConditionStatus == nil {
88+
t.Error("ConditionStatus metric not created")
89+
}
8790
if metrics.Info == nil {
8891
t.Error("Info metric not created")
8992
}
@@ -175,6 +178,8 @@ func TestMetricUpdates(t *testing.T) {
175178
// Test gauge metrics
176179
metrics.ProblemsActive.WithLabelValues("test-node", "DiskPressure", "warning").Set(5)
177180
metrics.MonitorUp.WithLabelValues("test-node", "disk-monitor", "disk").Set(1)
181+
metrics.ConditionStatus.WithLabelValues("test-node", "NetworkPartitioned").Set(1)
182+
metrics.ConditionStatus.WithLabelValues("test-node", "CNIHealthy").Set(0)
178183
metrics.Info.WithLabelValues("test-node", "1.0.0", "abc123", "go1.21", "2023-01-01").Set(1)
179184
metrics.StartTimeSeconds.WithLabelValues("test-node").Set(1640995200)
180185
metrics.UptimeSeconds.WithLabelValues("test-node").Set(3600)
@@ -211,6 +216,7 @@ func TestMetricUpdates(t *testing.T) {
211216
"test_export_errors_total",
212217
"test_problems_active",
213218
"test_monitor_up",
219+
"test_condition_status",
214220
"test_info",
215221
"test_start_time_seconds",
216222
"test_uptime_seconds",

0 commit comments

Comments (0)