Skip to content

Commit c146577

Browse files
metrics: Stop double reporting "failure" cluster_version
We currently report failure twice. Now that we have from_version, we can report the previous completed version.
1 parent e47c778 commit c146577

File tree

3 files changed

+23
-22
lines changed

3 files changed

+23
-22
lines changed

docs/dev/metrics.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ The cluster version is reported as seconds since the epoch with labels for `vers
88
* `current` - the version the operator is applying right now (the running CVO version) and the age of the payload
99
* `initial` - the initial version of the cluster, and value is the creation timestamp of the cluster version (cluster age)
1010
* `cluster` - the current version of the cluster with from_version set to the initial version, and value is the creation timestamp of the cluster version (cluster age)
11-
* `failure` - if the failure condition is set, reports the last transition time for both desired and current versions
11+
* `failure` - if the failure condition is set, reports the last transition time for the condition.
1212
* `desired` - reported if different from current as the most recent timestamp on the cluster version
1313
* `completed` - the time the most recent version was completely applied, or absent if not reached
1414
* `updating` - if the operator is moving to a new version, the time the update started

pkg/cvo/metrics.go

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ package cvo
33
import (
44
"time"
55

6-
apierrors "k8s.io/apimachinery/pkg/api/errors"
7-
"k8s.io/apimachinery/pkg/util/sets"
86
"github.com/prometheus/client_golang/prometheus"
7+
apierrors "k8s.io/apimachinery/pkg/api/errors"
98
"k8s.io/apimachinery/pkg/labels"
9+
"k8s.io/apimachinery/pkg/util/sets"
1010
"k8s.io/client-go/tools/cache"
1111

1212
configv1 "github.com/openshift/api/config/v1"
@@ -181,23 +181,21 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
181181
g.Set(float64(cv.CreationTimestamp.Unix()))
182182
ch <- g
183183

184-
failing := resourcemerge.FindOperatorStatusCondition(cv.Status.Conditions, ClusterStatusFailing)
184+
// answers "is there a desired update we have not yet satisfied"
185185
if update := cv.Spec.DesiredUpdate; update != nil && update.Image != current.Image {
186-
g := m.version.WithLabelValues("desired", update.Version, update.Image, completed.Version)
186+
g = m.version.WithLabelValues("desired", update.Version, update.Image, completed.Version)
187187
g.Set(float64(mostRecentTimestamp(cv)))
188188
ch <- g
189-
if failing != nil && failing.Status == configv1.ConditionTrue {
190-
g = m.version.WithLabelValues("failure", update.Version, update.Image, completed.Version)
191-
if failing.LastTransitionTime.IsZero() {
192-
g.Set(0)
193-
} else {
194-
g.Set(float64(failing.LastTransitionTime.Unix()))
195-
}
196-
ch <- g
197-
}
198189
}
190+
191+
// answers "if we are failing, are we updating or reconciling"
192+
failing := resourcemerge.FindOperatorStatusCondition(cv.Status.Conditions, ClusterStatusFailing)
199193
if failing != nil && failing.Status == configv1.ConditionTrue {
200-
g := m.version.WithLabelValues("failure", current.Version, current.Image, completed.Version)
194+
if update := cv.Spec.DesiredUpdate; update != nil && update.Image != current.Image {
195+
g = m.version.WithLabelValues("failure", update.Version, update.Image, completed.Version)
196+
} else {
197+
g = m.version.WithLabelValues("failure", current.Version, current.Image, completed.Version)
198+
}
201199
if failing.LastTransitionTime.IsZero() {
202200
g.Set(0)
203201
} else {

pkg/cvo/metrics_test.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,9 @@ func Test_operatorMetrics_Collect(t *testing.T) {
344344
DesiredUpdate: &configv1.Update{Version: "1.0.0", Image: "test/image:2"},
345345
},
346346
Status: configv1.ClusterVersionStatus{
347+
History: []configv1.UpdateHistory{
348+
{State: configv1.CompletedUpdate, Version: "0.0.2", Image: "test/image:1", CompletionTime: &([]metav1.Time{{Time: time.Unix(2, 0)}}[0])},
349+
},
347350
Conditions: []configv1.ClusterOperatorStatusCondition{
348351
{Type: ClusterStatusFailing, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(4, 0)}, Reason: "Because stuff"},
349352
},
@@ -356,13 +359,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
356359
if len(metrics) != 8 {
357360
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
358361
}
359-
expectMetric(t, metrics[0], 5, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
360-
expectMetric(t, metrics[1], 5, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
361-
expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
362-
expectMetric(t, metrics[3], 4, map[string]string{"type": "failure", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
363-
expectMetric(t, metrics[4], 4, map[string]string{"type": "failure", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
362+
expectMetric(t, metrics[0], 2, map[string]string{"type": "completed", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
363+
expectMetric(t, metrics[1], 5, map[string]string{"type": "initial", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
364+
expectMetric(t, metrics[2], 5, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": "0.0.2"})
365+
expectMetric(t, metrics[3], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": "0.0.2"})
366+
expectMetric(t, metrics[4], 4, map[string]string{"type": "failure", "version": "1.0.0", "image": "test/image:2", "from_version": "0.0.2"})
364367
expectMetric(t, metrics[5], 1, map[string]string{"name": "version", "condition": "Failing", "reason": "Because stuff"})
365-
expectMetric(t, metrics[6], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
368+
expectMetric(t, metrics[6], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": "0.0.2"})
366369
expectMetric(t, metrics[7], 1, map[string]string{"type": ""})
367370
},
368371
},
@@ -409,7 +412,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
409412
cmConfigLister: &cmConfigLister{
410413
Items: []*corev1.ConfigMap{{
411414
ObjectMeta: metav1.ObjectMeta{Name: "openshift-install"},
412-
Data: map[string]string {"version": "v0.0.2", "invoker": "jane"},
415+
Data: map[string]string{"version": "v0.0.2", "invoker": "jane"},
413416
}},
414417
},
415418
},

0 commit comments

Comments
 (0)