pkg/cvo/metrics: Connect ClusterVersion to ClusterOperatorDown and ClusterOperatorDegraded

wking · wking · commit 10849d7ce36f · 2024-04-10T00:04:09.000-07:00
By adding cluster_operator_up handling for ClusterVersion, with 'version' as the component name, the same way we handle cluster_operator_conditions. This plugs us into ClusterOperatorDown (based on cluster_operator_up) and ClusterOperatorDegraded (based on both cluster_operator_conditions and cluster_operator_up). I've adjusted the ClusterOperatorDegraded rule so that it fires on ClusterVersion Failing=True and does not fire on Failing=False. Thinking through an update from before: 1. Outgoing CVO does not serve cluster_operator_up{name="version"}. 2. User requests an update to a release with this change. 3. New CVO comes in, starts serving cluster_operator_up{name="version"}. 4. Old ClusterOperatorDegraded sees no matching cluster_operator_conditions{name="version",condition="Degraded"}, falls through to cluster_operator_up{name="version"}, and starts cooking the 'for: 30m'. 5. If we go more than 30m before updating the ClusterOperatorDegraded rule to understand Failing, ClusterOperatorDegraded would fire. We'll need to backport the ClusterOperatorDegraded expr change to one 4.y release before the CVO-metrics change lands to get: 1. Outgoing CVO does not serve cluster_operator_up{name="version"}. 2. User requests an update to a release with the expr change. 3. Incoming ClusterOperatorDegraded sees no cluster_operator_conditions{name="version",condition="Degraded"}, cluster_operator_conditions{name="version",condition="Failing"} (we hope), or cluster_operator_up{name="version"}, so it doesn't fire. Unless we are Failing=True, in which case, hooray, we'll start alerting about it. 4. User requests an update to a release with the CVO-metrics change. 5. New CVO starts serving cluster_operator_up, just like the fresh-modern-install situation, and everything is great. 
The missing-ClusterVersion metrics don't matter all that much today, because the CVO has been creating a replacement ClusterVersion since at least 90e9881 (cvo: Change the core CVO loops to report status to ClusterVersion, 2018-11-02, #45). But it will become more important with [1], which is planning on removing that default creation. When there is no ClusterVersion, we expect ClusterOperatorDown to fire. The awkward: {{ "{{ ... \"version\" }} ... {{ end }}" }} business is because this content is unpacked in two rounds of templating: 1. The cluster-version operator's getPayloadTasks' renderManifest preprocessing for the CVO directory, which is based on Go templates. 2. Prometheus alerting-rule templates, which use console templates [2], which are also based on Go templates [3]. The '{{ "..." }}' wrapping is consumed by the CVO's templating, and the remaining: {{ ... "version" }} ... {{ end }} is left for Prometheus' templating. [1]: #741 [2]: https://prometheus.io/docs/prometheus/2.51/configuration/alerting_rules/#templating [3]: https://prometheus.io/docs/visualization/consoles/
diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml
@@ -96,7 +96,7 @@ spec:
     - alert: ClusterOperatorDown
       annotations:
         summary: Cluster operator has not been available for 10 minutes.
-        description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded.  Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
+        description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded.  Cluster upgrades may not complete. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
         runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md
       expr: |
         max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0)
@@ -106,13 +106,15 @@ spec:
     - alert: ClusterOperatorDegraded
       annotations:
         summary: Cluster operator has been degraded for 30 minutes.
-        description: The {{ "{{ $labels.name }}" }} operator is degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service.  Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
+        description: The {{ "{{ $labels.name }}" }} operator is degraded because {{ "{{ $labels.reason }}" }}, and the components it manages may have reduced quality of service.  Cluster upgrades may not complete. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
         runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
       expr: |
         max by (namespace, name, reason)
         (
           (
-            cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"}
+            cluster_operator_conditions{job="cluster-version-operator", name!="version", condition="Degraded"}
+            or on (namespace, name)
+            cluster_operator_conditions{job="cluster-version-operator", name="version", condition="Failing"}
             or on (namespace, name)
             group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"})
           ) == 1
@@ -123,7 +125,7 @@ spec:
     - alert: ClusterOperatorFlapping
       annotations:
         summary: Cluster operator up status is changing often.
-        description: The  {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
+        description: The  {{ "{{ $labels.name }}" }} operator behavior might cause upgrades to be unstable. For more information refer to '{{ "{{ if eq $labels.name \"version\" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ $labels.name }}{{ end }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
       expr: |
         max by (namespace, name) (changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2)
       for: 10m
diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go
@@ -374,7 +374,16 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
 	current := m.optr.currentVersion()
 	var completed configv1.UpdateHistory
 
-	if cv, err := m.optr.cvLister.Get(m.optr.name); err == nil {
+	if cv, err := m.optr.cvLister.Get(m.optr.name); apierrors.IsNotFound(err) {
+		g := m.clusterOperatorUp.WithLabelValues("version", "", "ClusterVersionNotFound")
+		g.Set(0)
+		ch <- g
+
+		g = m.clusterOperatorConditions.WithLabelValues("version", string(configv1.OperatorAvailable), "ClusterVersionNotFound")
+		g.Set(0)
+		ch <- g
+	} else if err == nil {
+
 		// output cluster version
 
 		var initial configv1.UpdateHistory
@@ -484,7 +493,18 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
 				klog.V(2).Infof("skipping metrics for ClusterVersion condition %s=%s (neither True nor False)", condition.Type, condition.Status)
 				continue
 			}
-			g := m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
+
+			if condition.Type == configv1.OperatorAvailable {
+				g = m.clusterOperatorUp.WithLabelValues("version", completed.Version, string(condition.Reason))
+				if condition.Status == configv1.ConditionTrue {
+					g.Set(1)
+				} else {
+					g.Set(0)
+				}
+				ch <- g
+			}
+
+			g = m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
 			if condition.Status == configv1.ConditionTrue {
 				g.Set(1)
 			} else {
diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go