Skip to content

Commit 6861c48

Browse files
committed
pkg/cvo/metrics: Report ClusterVersion conditions with reasons
This will allow us to discover upgrade and other failure reasons without having to resort to a must-gather or similar [1], and also to look at any other ClusterVersion conditions in Telemetry. Stick this in cluster_operator_conditions, since we already have a 'reason' slot there and since ClusterVersion.Status.Conditions is pretty much the same thing as ClusterOperator.Status.Conditions; we'll want to see all of those. I don't see a reason to add a new metric to separate cluster-version operator failures from second-level operator failures; the 'name' label (set to "version" for ClusterVersion conditions) should be sufficient for that. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1741645
1 parent 015b9df commit 6861c48

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

pkg/cvo/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,19 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
227227
g.Set(float64(len(cv.Status.AvailableUpdates)))
228228
ch <- g
229229
}
230+
231+
for _, condition := range cv.Status.Conditions {
232+
if condition.Status == configv1.ConditionUnknown {
233+
continue
234+
}
235+
g := m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
236+
if condition.Status == configv1.ConditionTrue {
237+
g.Set(1)
238+
} else {
239+
g.Set(0)
240+
}
241+
ch <- g
242+
}
230243
}
231244

232245
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -269,22 +269,24 @@ func Test_operatorMetrics_Collect(t *testing.T) {
269269
},
270270
Status: configv1.ClusterVersionStatus{
271271
Conditions: []configv1.ClusterOperatorStatusCondition{
272-
{Type: configv1.RetrievedUpdates, Status: configv1.ConditionTrue},
272+
{Type: configv1.RetrievedUpdates, Status: configv1.ConditionTrue, Reason: "Because stuff"},
273273
},
274274
},
275275
},
276276
},
277277
},
278278
},
279279
wants: func(t *testing.T, metrics []prometheus.Metric) {
280-
if len(metrics) != 5 {
280+
if len(metrics) != 6 {
281281
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
282282
}
283283
expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
284284
expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "", "image": "", "from_version": ""})
285+
285286
expectMetric(t, metrics[2], 0, map[string]string{"upstream": "<default>", "channel": ""})
286-
expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
287-
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
287+
expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "RetrievedUpdates", "reason": "Because stuff"})
288+
expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
289+
expectMetric(t, metrics[5], 1, map[string]string{"type": ""})
288290
},
289291
},
290292
{
@@ -305,22 +307,23 @@ func Test_operatorMetrics_Collect(t *testing.T) {
305307
},
306308
Status: configv1.ClusterVersionStatus{
307309
Conditions: []configv1.ClusterOperatorStatusCondition{
308-
{Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(5, 0)}},
310+
{Type: configv1.OperatorAvailable, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(5, 0)}, Reason: "Because stuff"},
309311
},
310312
},
311313
},
312314
},
313315
},
314316
},
315317
wants: func(t *testing.T, metrics []prometheus.Metric) {
316-
if len(metrics) != 5 {
318+
if len(metrics) != 6 {
317319
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
318320
}
319321
expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
320322
expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
321323
expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
322-
expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
323-
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
324+
expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "Available", "reason": "Because stuff"})
325+
expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
326+
expectMetric(t, metrics[5], 1, map[string]string{"type": ""})
324327
},
325328
},
326329
{
@@ -342,24 +345,25 @@ func Test_operatorMetrics_Collect(t *testing.T) {
342345
},
343346
Status: configv1.ClusterVersionStatus{
344347
Conditions: []configv1.ClusterOperatorStatusCondition{
345-
{Type: ClusterStatusFailing, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(4, 0)}},
348+
{Type: ClusterStatusFailing, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Time{Time: time.Unix(4, 0)}, Reason: "Because stuff"},
346349
},
347350
},
348351
},
349352
},
350353
},
351354
},
352355
wants: func(t *testing.T, metrics []prometheus.Metric) {
353-
if len(metrics) != 7 {
356+
if len(metrics) != 8 {
354357
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
355358
}
356359
expectMetric(t, metrics[0], 5, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
357360
expectMetric(t, metrics[1], 5, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
358361
expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
359362
expectMetric(t, metrics[3], 4, map[string]string{"type": "failure", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
360363
expectMetric(t, metrics[4], 4, map[string]string{"type": "failure", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
361-
expectMetric(t, metrics[5], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
362-
expectMetric(t, metrics[6], 1, map[string]string{"type": ""})
364+
expectMetric(t, metrics[5], 1, map[string]string{"name": "version", "condition": "Failing", "reason": "Because stuff"})
365+
expectMetric(t, metrics[6], 6, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
366+
expectMetric(t, metrics[7], 1, map[string]string{"type": ""})
363367
},
364368
},
365369
{
@@ -377,22 +381,23 @@ func Test_operatorMetrics_Collect(t *testing.T) {
377381
},
378382
Status: configv1.ClusterVersionStatus{
379383
Conditions: []configv1.ClusterOperatorStatusCondition{
380-
{Type: ClusterStatusFailing, Status: configv1.ConditionTrue},
384+
{Type: ClusterStatusFailing, Status: configv1.ConditionTrue, Reason: "Because stuff"},
381385
},
382386
},
383387
},
384388
},
385389
},
386390
},
387391
wants: func(t *testing.T, metrics []prometheus.Metric) {
388-
if len(metrics) != 5 {
392+
if len(metrics) != 6 {
389393
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
390394
}
391395
expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
392396
expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
393397
expectMetric(t, metrics[2], 0, map[string]string{"type": "failure", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
394-
expectMetric(t, metrics[3], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
395-
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
398+
expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "Failing", "reason": "Because stuff"})
399+
expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
400+
expectMetric(t, metrics[5], 1, map[string]string{"type": ""})
396401
},
397402
},
398403
{

0 commit comments

Comments
 (0)