Skip to content

Commit 5b800aa

Browse files
committed
feat: add metrics for reconciliation and status updates
Signed-off-by: Kevin Conner <[email protected]>
1 parent bd28f29 commit 5b800aa

File tree

6 files changed

+425
-10
lines changed

6 files changed

+425
-10
lines changed

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ require (
88
github.com/go-logr/logr v1.4.2
99
github.com/onsi/ginkgo/v2 v2.22.0
1010
github.com/onsi/gomega v1.36.1
11+
github.com/prometheus/client_golang v1.19.1
12+
github.com/prometheus/client_model v0.6.1
13+
github.com/stretchr/testify v1.9.0
1114
golang.org/x/time v0.7.0
1215
k8s.io/api v0.32.1
1316
k8s.io/apimachinery v0.32.1
@@ -54,8 +57,7 @@ require (
5457
github.com/modern-go/reflect2 v1.0.2 // indirect
5558
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
5659
github.com/pkg/errors v0.9.1 // indirect
57-
github.com/prometheus/client_golang v1.19.1 // indirect
58-
github.com/prometheus/client_model v0.6.1 // indirect
60+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
5961
github.com/prometheus/common v0.55.0 // indirect
6062
github.com/prometheus/procfs v0.15.1 // indirect
6163
github.com/spf13/cobra v1.8.1 // indirect

internal/metrics/metrics.go

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Package metrics provides Prometheus metrics for the model validation operator.
2+
package metrics
3+
4+
import (
5+
"github.com/prometheus/client_golang/prometheus"
6+
"sigs.k8s.io/controller-runtime/pkg/metrics"
7+
)
8+
9+
const (
10+
// Metric label names
11+
labelNamespace = "namespace"
12+
labelModelValidation = "model_validation"
13+
labelPodState = "pod_state"
14+
labelStatusUpdateResult = "result"
15+
labelDriftType = "drift_type"
16+
17+
// PodStateInjected represents pods with model validation finalizers
18+
PodStateInjected = "injected"
19+
// PodStateUninjected represents pods without model validation finalizers
20+
PodStateUninjected = "uninjected"
21+
// PodStateOrphaned represents pods with configuration drift
22+
PodStateOrphaned = "orphaned"
23+
24+
// StatusUpdateSuccess indicates a successful status update
25+
StatusUpdateSuccess = "success"
26+
// StatusUpdateFailure indicates a failed status update
27+
StatusUpdateFailure = "failure"
28+
)
29+
30+
var (
31+
// ModelValidationPodCounts tracks the current number of pods in each state per ModelValidation
32+
ModelValidationPodCounts = prometheus.NewGaugeVec(
33+
prometheus.GaugeOpts{
34+
Namespace: "model_validation_operator",
35+
Name: "modelvalidation_pod_count",
36+
Help: "Current number of pods tracked per ModelValidation by state",
37+
},
38+
[]string{labelNamespace, labelModelValidation, labelPodState},
39+
)
40+
41+
// PodStateTransitionsTotal tracks pod state transitions
42+
PodStateTransitionsTotal = prometheus.NewCounterVec(
43+
prometheus.CounterOpts{
44+
Namespace: "model_validation_operator",
45+
Name: "pod_state_transitions_total",
46+
Help: "Total number of pod state transitions",
47+
},
48+
[]string{labelNamespace, labelModelValidation, "from_state", "to_state"},
49+
)
50+
51+
// StatusUpdatesTotal tracks ModelValidation status updates
52+
StatusUpdatesTotal = prometheus.NewCounterVec(
53+
prometheus.CounterOpts{
54+
Namespace: "model_validation_operator",
55+
Name: "status_updates_total",
56+
Help: "Total number of ModelValidation status updates",
57+
},
58+
[]string{labelNamespace, labelModelValidation, labelStatusUpdateResult},
59+
)
60+
61+
// ConfigurationDriftEventsTotal tracks configuration drift events
62+
ConfigurationDriftEventsTotal = prometheus.NewCounterVec(
63+
prometheus.CounterOpts{
64+
Namespace: "model_validation_operator",
65+
Name: "configuration_drift_events_total",
66+
Help: "Total number of configuration drift events detected",
67+
},
68+
[]string{labelNamespace, labelModelValidation, labelDriftType},
69+
)
70+
71+
// ModelValidationCRsTotal tracks total number of ModelValidation CRs per namespace
72+
// Does not include authMethod for namespace-level tracking
73+
ModelValidationCRsTotal = prometheus.NewGaugeVec(
74+
prometheus.GaugeOpts{
75+
Namespace: "model_validation_operator",
76+
Name: "modelvalidation_crs_total",
77+
Help: "Total number of ModelValidation CRs being tracked per namespace",
78+
},
79+
[]string{labelNamespace},
80+
)
81+
82+
// StatusUpdateDuration tracks the duration of status update operations
83+
StatusUpdateDuration = prometheus.NewHistogramVec(
84+
prometheus.HistogramOpts{
85+
Namespace: "model_validation_operator",
86+
Name: "status_update_duration_seconds",
87+
Help: "Duration of ModelValidation status update operations",
88+
Buckets: prometheus.DefBuckets,
89+
},
90+
[]string{labelNamespace, labelModelValidation, labelStatusUpdateResult},
91+
)
92+
93+
// QueueSize tracks the current size of the status update queue
94+
QueueSize = prometheus.NewGauge(
95+
prometheus.GaugeOpts{
96+
Namespace: "model_validation_operator",
97+
Name: "status_update_queue_size",
98+
Help: "Current size of the status update queue",
99+
},
100+
)
101+
102+
// RetryAttemptsTotal tracks retry attempts for status updates
103+
RetryAttemptsTotal = prometheus.NewCounterVec(
104+
prometheus.CounterOpts{
105+
Namespace: "model_validation_operator",
106+
Name: "status_update_retry_attempts_total",
107+
Help: "Total number of status update retry attempts",
108+
},
109+
[]string{labelNamespace, labelModelValidation},
110+
)
111+
)
112+
113+
func init() {
114+
metrics.Registry.MustRegister(
115+
ModelValidationPodCounts,
116+
PodStateTransitionsTotal,
117+
StatusUpdatesTotal,
118+
ConfigurationDriftEventsTotal,
119+
ModelValidationCRsTotal,
120+
StatusUpdateDuration,
121+
QueueSize,
122+
RetryAttemptsTotal,
123+
)
124+
}
125+
126+
// RecordPodCount records the current pod count for a ModelValidation
127+
func RecordPodCount(namespace, modelValidation, podState string, count float64) {
128+
ModelValidationPodCounts.WithLabelValues(namespace, modelValidation, podState).Set(count)
129+
}
130+
131+
// RecordPodStateTransition records a pod state transition
132+
func RecordPodStateTransition(namespace, modelValidation, fromState, toState string) {
133+
PodStateTransitionsTotal.WithLabelValues(namespace, modelValidation, fromState, toState).Inc()
134+
}
135+
136+
// RecordStatusUpdate records a status update result
137+
func RecordStatusUpdate(namespace, modelValidation, result string) {
138+
StatusUpdatesTotal.WithLabelValues(namespace, modelValidation, result).Inc()
139+
}
140+
141+
// RecordConfigurationDrift records a configuration drift event
142+
func RecordConfigurationDrift(namespace, modelValidation, driftType string) {
143+
ConfigurationDriftEventsTotal.WithLabelValues(namespace, modelValidation, driftType).Inc()
144+
}
145+
146+
// RecordModelValidationCR records the current number of ModelValidation CRs per namespace
147+
func RecordModelValidationCR(namespace string, count float64) {
148+
ModelValidationCRsTotal.WithLabelValues(namespace).Set(count)
149+
}
150+
151+
// RecordStatusUpdateDuration records the duration of a status update
152+
func RecordStatusUpdateDuration(namespace, modelValidation, result string, duration float64) {
153+
StatusUpdateDuration.WithLabelValues(namespace, modelValidation, result).Observe(duration)
154+
}
155+
156+
// SetQueueSize sets the current queue size
157+
func SetQueueSize(size float64) {
158+
QueueSize.Set(size)
159+
}
160+
161+
// RecordRetryAttempt records a retry attempt
162+
func RecordRetryAttempt(namespace, modelValidation string) {
163+
RetryAttemptsTotal.WithLabelValues(namespace, modelValidation).Inc()
164+
}
165+
166+
// RecordMultiplePodStateTransitions records multiple identical pod state transitions
167+
func RecordMultiplePodStateTransitions(namespace, modelValidation, fromState, toState string, count int) {
168+
if count > 0 {
169+
PodStateTransitionsTotal.WithLabelValues(namespace, modelValidation, fromState, toState).Add(float64(count))
170+
}
171+
}

internal/metrics/metrics_test.go

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
package metrics
2+
3+
import (
4+
"testing"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
dto "github.com/prometheus/client_model/go"
8+
"github.com/stretchr/testify/assert"
9+
"sigs.k8s.io/controller-runtime/pkg/metrics"
10+
)
11+
12+
const (
13+
testNamespace = "test-namespace"
14+
testModelValidation = "test-mv"
15+
)
16+
17+
func TestMetricsDefinition(t *testing.T) {
18+
// Test that all metric variables are defined
19+
assert.NotNil(t, ModelValidationPodCounts)
20+
assert.NotNil(t, PodStateTransitionsTotal)
21+
assert.NotNil(t, StatusUpdatesTotal)
22+
assert.NotNil(t, ConfigurationDriftEventsTotal)
23+
assert.NotNil(t, ModelValidationCRsTotal)
24+
assert.NotNil(t, StatusUpdateDuration)
25+
assert.NotNil(t, QueueSize)
26+
assert.NotNil(t, RetryAttemptsTotal)
27+
}
28+
29+
// Test helper to verify gauge metrics
30+
func verifyGaugeMetric(t *testing.T, gauge *prometheus.GaugeVec, labels []string, expectedValue float64) {
31+
metric := gauge.WithLabelValues(labels...)
32+
metricDto := &dto.Metric{}
33+
err := metric.Write(metricDto)
34+
assert.NoError(t, err)
35+
assert.Equal(t, expectedValue, metricDto.GetGauge().GetValue())
36+
}
37+
38+
// Test helper to verify counter metrics
39+
func verifyCounterIncrement(t *testing.T, counter *prometheus.CounterVec, labels []string) float64 {
40+
metric := counter.WithLabelValues(labels...)
41+
metricDto := &dto.Metric{}
42+
err := metric.Write(metricDto)
43+
assert.NoError(t, err)
44+
return metricDto.GetCounter().GetValue()
45+
}
46+
47+
func TestRecordPodCount(t *testing.T) {
48+
podState := PodStateInjected
49+
count := float64(5)
50+
51+
RecordPodCount(testNamespace, testModelValidation, podState, count)
52+
53+
verifyGaugeMetric(t, ModelValidationPodCounts, []string{testNamespace, testModelValidation, podState}, count)
54+
}
55+
56+
func TestRecordPodStateTransition(t *testing.T) {
57+
fromState := PodStateUninjected
58+
toState := PodStateInjected
59+
60+
labels := []string{testNamespace, testModelValidation, fromState, toState}
61+
initialValue := verifyCounterIncrement(t, PodStateTransitionsTotal, labels)
62+
RecordPodStateTransition(testNamespace, testModelValidation, fromState, toState)
63+
finalValue := verifyCounterIncrement(t, PodStateTransitionsTotal, labels)
64+
65+
assert.Equal(t, initialValue+1, finalValue)
66+
}
67+
68+
func TestRecordStatusUpdate(t *testing.T) {
69+
result := StatusUpdateSuccess
70+
71+
labels := []string{testNamespace, testModelValidation, result}
72+
initialValue := verifyCounterIncrement(t, StatusUpdatesTotal, labels)
73+
RecordStatusUpdate(testNamespace, testModelValidation, result)
74+
finalValue := verifyCounterIncrement(t, StatusUpdatesTotal, labels)
75+
76+
assert.Equal(t, initialValue+1, finalValue)
77+
}
78+
79+
func TestRecordConfigurationDrift(t *testing.T) {
80+
driftType := "config_hash"
81+
82+
labels := []string{testNamespace, testModelValidation, driftType}
83+
initialValue := verifyCounterIncrement(t, ConfigurationDriftEventsTotal, labels)
84+
RecordConfigurationDrift(testNamespace, testModelValidation, driftType)
85+
finalValue := verifyCounterIncrement(t, ConfigurationDriftEventsTotal, labels)
86+
87+
assert.Equal(t, initialValue+1, finalValue)
88+
}
89+
90+
func TestRecordModelValidationCR(t *testing.T) {
91+
count := float64(3)
92+
93+
RecordModelValidationCR(testNamespace, count)
94+
95+
verifyGaugeMetric(t, ModelValidationCRsTotal, []string{testNamespace}, count)
96+
}
97+
98+
func TestSetQueueSize(t *testing.T) {
99+
size := float64(10)
100+
101+
SetQueueSize(size)
102+
103+
metricDto := &dto.Metric{}
104+
err := QueueSize.Write(metricDto)
105+
assert.NoError(t, err)
106+
assert.Equal(t, size, metricDto.GetGauge().GetValue())
107+
}
108+
109+
func TestRecordStatusUpdateDuration(t *testing.T) {
110+
result := StatusUpdateSuccess
111+
duration := 0.5 // 500ms
112+
113+
RecordStatusUpdateDuration(testNamespace, testModelValidation, result, duration)
114+
115+
// Verify the histogram was recorded by checking the metric family
116+
metricFamilies, err := metrics.Registry.Gather()
117+
assert.NoError(t, err)
118+
119+
var found bool
120+
for _, mf := range metricFamilies {
121+
if mf.GetName() == "model_validation_operator_status_update_duration_seconds" {
122+
for _, metric := range mf.GetMetric() {
123+
if metric.GetHistogram().GetSampleCount() > 0 {
124+
found = true
125+
break
126+
}
127+
}
128+
}
129+
}
130+
assert.True(t, found, "Expected histogram metric to be recorded")
131+
}
132+
133+
func TestRecordMultiplePodStateTransitions(t *testing.T) {
134+
fromState := PodStateUninjected
135+
toState := PodStateInjected
136+
count := 3
137+
138+
labels := []string{testNamespace, testModelValidation, fromState, toState}
139+
initialValue := verifyCounterIncrement(t, PodStateTransitionsTotal, labels)
140+
RecordMultiplePodStateTransitions(testNamespace, testModelValidation, fromState, toState, count)
141+
finalValue := verifyCounterIncrement(t, PodStateTransitionsTotal, labels)
142+
143+
assert.Equal(t, initialValue+float64(count), finalValue)
144+
}

0 commit comments

Comments
 (0)