Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0

We adopt the same pattern for controller Conditions, but we export only one time series per (status, reason) variant,
meaning we delete all other variants in the group when we set the metric, ensuring the cardinality stays under control.
Additionally, rather than returning 1/0 to indicate whether the series is active, we set the metric value to the
condition's last transition time (Unix timestamp, in seconds).

Example metric:

Expand All @@ -146,12 +148,13 @@ operator_controller_condition{
condition="Ready",
status="False",
reason="FailedToProvision"
} 1
} 1759174210
```

- **Index**: controller, resource_kind, resource_name, resource_namespace
- **Group**: condition
- **Extra**: status, reason
- **Metric Value**: Unix timestamp (seconds) of the last transition of the given condition

### Initialization

Expand Down Expand Up @@ -223,10 +226,12 @@ const (
)

// SetStatusCondition utility function which replaces and wraps meta.SetStatusCondition calls
func (r *MyReconciler) SetStatusCondition(cr *v1.MyCR, condition metav1.Condition) bool {
changed := meta.SetStatusCondition(&cr.Status.Conditions, condition)
func (r *MyReconciler) SetStatusCondition(cr *v1.MyCR, cond metav1.Condition) bool {
changed := meta.SetStatusCondition(&cr.Status.Conditions, cond)
if changed {
r.Recorder.RecordConditionFor(kind, cr, condition.Type, string(condition.Status), condition.Reason)
r.Recorder.RecordConditionFor(
kind, cr, cond.Type, string(cond.Status), cond.Reason, cond.LastTransitionTime.Time,
)
}
return changed
}
Expand Down
28 changes: 19 additions & 9 deletions pkg/operator_condition_metrics/operator_condition_metrics.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package operator_condition_metrics

import (
"time"

metrics "github.com/sourcehawk/go-prometheus-gaugevecset/pkg/gauge_vec_set"
)

Expand All @@ -15,7 +17,8 @@ and marking exactly one as active (1) while the others are inactive (0). Example
kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0

We adopt the same pattern for controller Conditions, but we export one time series per (status, reason) variant
and enforce **exclusivity per condition**.
and enforce **exclusivity per condition**. The metric value is the Unix timestamp of the condition's last
transition, rather than 1/0.

For any given (controller, kind, name, namespace, condition) exactly one (status, reason) series is present at a time.
All other variants are **deleted**. This keeps cardinality under control.
Expand All @@ -33,7 +36,7 @@ Labels (order matches registration)
- reason: short machine-typed reason (often "" when status="True")

Value
- Always 1 for the single active (status, reason) series in the group.
- The Unix timestamp (seconds) of the condition's last transition, set on the single active (status, reason) series in the group.

Examples:

Expand All @@ -47,7 +50,7 @@ Examples:
condition="Ready",
status="True",
reason=""
} 1
} 1759174202

(Other status/reason variants for this condition are removed.)

Expand All @@ -60,7 +63,7 @@ Examples:
condition="Ready",
status="False",
reason="Failed"
} 1
} 1759174205

3. Another condition can be active simultaneously (different group):

Expand All @@ -69,7 +72,7 @@ Examples:
condition="Synchronized",
status="True",
reason=""
} 1
} 1759174210

Cleanup
When the resource is deleted/pruned, all series for its index key
Expand Down Expand Up @@ -157,7 +160,9 @@ type ConditionMetricRecorder struct {
// RecordConditionFor sets a condition metric for a given controller and object.
//
// It enforces exclusivity within the same (controller, name, namespace, condition) group,
// ensuring that only the latest status (True/False/Unknown) is present for a given condition type.
// ensuring that only the latest (status, reason) is present for a given condition type.
//
// If lastTransitionTime is the zero time, the metric value falls back to the Unix timestamp of time.Now().UTC().
//
// The following label values are set:
//
Expand All @@ -171,15 +176,20 @@ type ConditionMetricRecorder struct {
//
// Example:
//
// r.RecordConditionFor(kind, obj, "Ready", "True", "AppReady")
// r.RecordConditionFor(kind, obj, "Ready", "True", "AppReady", lastTransitionTime)
func (r *ConditionMetricRecorder) RecordConditionFor(
kind string, object ObjectLike, conditionType, conditionStatus, conditionReason string,
kind string, object ObjectLike,
conditionType, conditionStatus, conditionReason string, lastTransitionTime time.Time,
) {
indexValues := []string{r.Controller, kind, object.GetName(), object.GetNamespace()}
groupValues := []string{conditionType}
extraValues := []string{conditionStatus, conditionReason}

r.OperatorConditionsGauge.SetGroup(1, indexValues, groupValues, extraValues...)
if lastTransitionTime.IsZero() {
lastTransitionTime = time.Now().UTC()
}

r.OperatorConditionsGauge.SetGroup(float64(lastTransitionTime.Unix()), indexValues, groupValues, extraValues...)
}

// RemoveConditionsFor deletes all condition metrics for a given resource.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"testing"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/expfmt"
Expand Down Expand Up @@ -65,6 +66,7 @@ func createBenchmarkScenario(tb testing.TB, registry *prometheus.Registry) *Cond
}

obj := &FakeObject{}
transitionTime := time.Now().UTC()

condition := &FakeCondition{
Status: "True", // doesn't matter, cardinality decided by Reason
Expand All @@ -82,7 +84,7 @@ func createBenchmarkScenario(tb testing.TB, registry *prometheus.Registry) *Cond

for v := 0; v < variantsPerCondition; v++ {
condition.Reason = generatedName("variant", v)
rec.RecordConditionFor(kind, obj, condition.Type, condition.Reason, condition.Reason)
rec.RecordConditionFor(kind, obj, condition.Type, condition.Reason, condition.Reason, transitionTime)
}
}
}
Expand All @@ -106,6 +108,7 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) {
Name: "Resource0",
Namespace: "namespace0",
}
transitionTime := time.Now().UTC()

// Two variants in the same (controller,kind,name,namespace,condition) group.
condTrue := &FakeCondition{
Expand All @@ -126,9 +129,9 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) {
for i := 0; i < b.N; i++ {
// Flip between two variants
if (i & 1) == 0 {
rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason)
rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime)
} else {
rec.RecordConditionFor(kind, obj, condFalse.Type, condFalse.Status, condFalse.Reason)
rec.RecordConditionFor(kind, obj, condFalse.Type, condFalse.Status, condFalse.Reason, transitionTime)
}
}
})
Expand All @@ -140,7 +143,7 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) {
for i := 0; i < b.N; i++ {
// Ensure there is something to remove, but do not count the set time.
b.StopTimer()
rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason)
rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime)
b.StartTimer()

rec.RemoveConditionsFor(kind, obj)
Expand Down
23 changes: 14 additions & 9 deletions pkg/operator_condition_metrics/operator_condition_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package operator_condition_metrics
import (
"strings"
"testing"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
Expand Down Expand Up @@ -30,23 +31,25 @@ func TestConditionMetricRecorder_Record_Transition_And_SecondCondition(t *testin
kind := "MyCRD"
name := "cr-1"
ns := "prod"
transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)

obj := makeObj(name, ns)

// Record Ready=True
rec.RecordConditionFor(kind, obj, "Ready", "True", "")
rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime)

// Flip Ready -> False with reason
rec.RecordConditionFor(kind, obj, "Ready", "False", "Failed")
rec.RecordConditionFor(kind, obj, "Ready", "False", "Failed", transitionTime)

// Another condition Synchronized=True (independent group)
rec.RecordConditionFor(kind, obj, "Synchronized", "True", "")
rec.RecordConditionFor(kind, obj, "Synchronized", "True", "", transitionTime)

// Expect: Ready False(reason="Failed") and Synchronized True, each valued at transitionTime's Unix timestamp (1735689600)
want := `
# HELP test_record_transition_and_second_condition_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition).
# TYPE test_record_transition_and_second_condition_controller_condition gauge
test_record_transition_and_second_condition_controller_condition{condition="Ready",controller="my-controller",reason="Failed",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="False"} 1
test_record_transition_and_second_condition_controller_condition{condition="Synchronized",controller="my-controller",reason="",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="True",} 1
test_record_transition_and_second_condition_controller_condition{condition="Ready",controller="my-controller",reason="Failed",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="False"} 1735689600
test_record_transition_and_second_condition_controller_condition{condition="Synchronized",controller="my-controller",reason="",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="True",} 1735689600
`
require.NoError(t,
testutil.GatherAndCompare(
Expand All @@ -72,10 +75,11 @@ func TestConditionMetricRecorder_RemoveConditionsFor(t *testing.T) {
kind := "MyCRD"
name := "cr-2"
ns := "staging"
transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
obj := makeObj(name, ns)

rec.RecordConditionFor(kind, obj, "Ready", "True", "")
rec.RecordConditionFor(kind, obj, "Synchronized", "False", "SyncPending")
rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime)
rec.RecordConditionFor(kind, obj, "Synchronized", "False", "SyncPending", transitionTime)

// Remove all condition series for this object
removed := rec.RemoveConditionsFor(kind, obj)
Expand Down Expand Up @@ -103,16 +107,17 @@ func TestConditionMetricRecorder_SetsKindLabelFromObject(t *testing.T) {
kind := "FancyKind"
name := "obj-1"
ns := "ns-1"
transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
obj := makeObj(name, ns)

// Record a condition
rec.RecordConditionFor(kind, obj, "Ready", "True", "")
rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime)

// Expect the 'kind' label to reflect the object's Kind
want := `
# HELP test_sets_kind_label_from_object_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition).
# TYPE test_sets_kind_label_from_object_controller_condition gauge
test_sets_kind_label_from_object_controller_condition{condition="Ready",controller="my-controller",reason="",resource_kind="FancyKind",resource_name="obj-1",resource_namespace="ns-1",status="True"} 1
test_sets_kind_label_from_object_controller_condition{condition="Ready",controller="my-controller",reason="",resource_kind="FancyKind",resource_name="obj-1",resource_namespace="ns-1",status="True"} 1735689600
`
require.NoError(t,
testutil.GatherAndCompare(
Expand Down