Skip to content

Commit 5e73deb

Browse files
Merge pull request #746 from wking/metrics-for-no-cluster-version
OCPBUGS-9133: pkg/cvo/metrics: Connect ClusterVersion to ClusterOperatorDown and ClusterOperatorDegraded
2 parents: debaaf6 + 10849d7; commit: 5e73deb

File tree

4 files changed

+103
-55
lines changed

4 files changed

+103
-55
lines changed

install/0000_90_cluster-version-operator_02_servicemonitor.yaml

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ spec:
9696
- alert: ClusterOperatorDown
9797
annotations:
9898
summary: Cluster operator has not been available for 10 minutes.
99-
description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
99+
description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
100100
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md
101101
expr: |
102102
max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0)
@@ -106,13 +106,15 @@ spec:
106106
- alert: ClusterOperatorDegraded
107107
annotations:
108108
summary: Cluster operator has been degraded for 30 minutes.
109-
description: The {{ "{{ $labels.name }}" }} operator is degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
109+
description: The {{ "{{ $labels.name }}" }} operator is degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service. Cluster upgrades may not complete. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
110110
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
111111
expr: |
112112
max by (namespace, name, reason)
113113
(
114114
(
115-
cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"}
115+
cluster_operator_conditions{job="cluster-version-operator", name!="version", condition="Degraded"}
116+
or on (namespace, name)
117+
cluster_operator_conditions{job="cluster-version-operator", name="version", condition="Failing"}
116118
or on (namespace, name)
117119
group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"})
118120
) == 1
@@ -123,7 +125,7 @@ spec:
123125
- alert: ClusterOperatorFlapping
124126
annotations:
125127
summary: Cluster operator up status is changing often.
126-
description: The {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
128+
description: The {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
127129
expr: |
128130
max by (namespace, name) (changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2)
129131
for: 10m

pkg/cvo/metrics.go

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -374,7 +374,16 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
374374
current := m.optr.currentVersion()
375375
var completed configv1.UpdateHistory
376376

377-
if cv, err := m.optr.cvLister.Get(m.optr.name); err == nil {
377+
if cv, err := m.optr.cvLister.Get(m.optr.name); apierrors.IsNotFound(err) {
378+
g := m.clusterOperatorUp.WithLabelValues("version", "", "ClusterVersionNotFound")
379+
g.Set(0)
380+
ch <- g
381+
382+
g = m.clusterOperatorConditions.WithLabelValues("version", string(configv1.OperatorAvailable), "ClusterVersionNotFound")
383+
g.Set(0)
384+
ch <- g
385+
} else if err == nil {
386+
378387
// output cluster version
379388

380389
var initial configv1.UpdateHistory
@@ -484,7 +493,18 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
484493
klog.V(2).Infof("skipping metrics for ClusterVersion condition %s=%s (neither True nor False)", condition.Type, condition.Status)
485494
continue
486495
}
487-
g := m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
496+
497+
if condition.Type == configv1.OperatorAvailable {
498+
g = m.clusterOperatorUp.WithLabelValues("version", completed.Version, string(condition.Reason))
499+
if condition.Status == configv1.ConditionTrue {
500+
g.Set(1)
501+
} else {
502+
g.Set(0)
503+
}
504+
ch <- g
505+
}
506+
507+
g = m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
488508
if condition.Status == configv1.ConditionTrue {
489509
g.Set(1)
490510
} else {

0 commit comments

Comments
 (0)