Skip to content

Commit 1005398

Browse files
committed
Extend CVO alerts to cover update retrieval
Added metric cluster_version_operator_update_retrieval_timestamp_seconds to track the last known successful update retrieval time, and use metric cluster_operator_conditions to determine the reason for an update retrieval failure. Added alert CannotRetrieveUpdates, which fires when the time elapsed since cluster_version_operator_update_retrieval_timestamp_seconds is >= 3600 seconds, unless the reason reported by cluster_operator_conditions is NoChannel. The alert reports how long ago updates were last known to be successfully retrieved, the reason updates could not be retrieved, and a console URL for more information.
1 parent fc25a6f commit 1005398

File tree

5 files changed

+37
-11
lines changed

5 files changed

+37
-11
lines changed

docs/user/status.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ This document describes those conditions and, where appropriate, suggests possib
88
When `RetrievedUpdates` is `True`, the CVO is successfully retrieving updates, which is good.
99
When `RetrievedUpdates` is `False`, `reason` will be set to explain why, as discussed in the following subsections.
1010
In all cases, the impact is that the cluster will not be able to retrieve recommended updates, so cluster admins will need to monitor for available updates on their own or risk falling behind on security or other bugfixes.
11+
When the CVO is unable to retrieve recommended updates, the `CannotRetrieveUpdates` alert will fire and include the reason. This alert will not fire when the reason is `NoChannel`.
1112

1213
### NoUpstream
1314

install/0000_90_cluster-version-operator_02_servicemonitor.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,26 @@ spec:
4444
for: 10m
4545
labels:
4646
severity: critical
47+
- alert: CannotRetrieveUpdates
48+
annotations:
49+
message: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
50+
expr: |
51+
(time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"}
52+
labels:
53+
severity: critical
4754
- name: cluster-operators
4855
rules:
4956
- alert: ClusterOperatorDown
5057
annotations:
51-
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 mins. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
58+
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 minutes. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
5259
expr: |
5360
cluster_operator_up{job="cluster-version-operator"} == 0
5461
for: 10m
5562
labels:
5663
severity: critical
5764
- alert: ClusterOperatorDegraded
5865
annotations:
59-
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 mins. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
66+
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
6067
expr: |
6168
cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1
6269
for: 10m

pkg/cvo/availableupdates.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"github.com/openshift/cluster-version-operator/pkg/cincinnati"
2121
)
2222

23+
// noChannel is the Reason set on the RetrievedUpdates condition when no update
// channel has been configured.
const noChannel string = "NoChannel"
24+
2325
// syncAvailableUpdates attempts to retrieve the latest updates and update the status of the ClusterVersion
2426
// object. It will set the RetrievedUpdates condition. Updates are only checked if it has been more than
2527
// the minimumUpdateCheckInterval since the last check.
@@ -179,7 +181,7 @@ func calculateAvailableUpdatesStatus(clusterID string, proxyURL *url.URL, tlsCon
179181

180182
if len(channel) == 0 {
181183
return nil, configv1.ClusterOperatorStatusCondition{
182-
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: "NoChannel",
184+
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: noChannel,
183185
Message: "The update channel has not been configured.",
184186
}
185187
}

pkg/cvo/cvo_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ import (
2727
"k8s.io/apimachinery/pkg/util/diff"
2828
"k8s.io/apimachinery/pkg/watch"
2929
"k8s.io/client-go/discovery"
30-
"k8s.io/client-go/rest"
3130
kfake "k8s.io/client-go/kubernetes/fake"
31+
"k8s.io/client-go/rest"
3232
ktesting "k8s.io/client-go/testing"
3333
"k8s.io/client-go/util/workqueue"
3434
"k8s.io/klog"
@@ -2375,7 +2375,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
23752375
Condition: configv1.ClusterOperatorStatusCondition{
23762376
Type: configv1.RetrievedUpdates,
23772377
Status: configv1.ConditionFalse,
2378-
Reason: "NoChannel",
2378+
Reason: noChannel,
23792379
Message: "The update channel has not been configured.",
23802380
},
23812381
},

pkg/cvo/metrics.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"k8s.io/apimachinery/pkg/labels"
1010
"k8s.io/apimachinery/pkg/util/sets"
1111
"k8s.io/client-go/tools/cache"
12+
"k8s.io/klog"
1213

1314
configv1 "github.com/openshift/api/config/v1"
1415
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
@@ -28,12 +29,13 @@ type operatorMetrics struct {
2829

2930
conditionTransitions map[conditionKey]int
3031

31-
version *prometheus.GaugeVec
32-
availableUpdates *prometheus.GaugeVec
33-
clusterOperatorUp *prometheus.GaugeVec
34-
clusterOperatorConditions *prometheus.GaugeVec
35-
clusterOperatorConditionTransitions *prometheus.GaugeVec
36-
clusterInstaller *prometheus.GaugeVec
32+
version *prometheus.GaugeVec
33+
availableUpdates *prometheus.GaugeVec
34+
clusterOperatorUp *prometheus.GaugeVec
35+
clusterOperatorConditions *prometheus.GaugeVec
36+
clusterOperatorConditionTransitions *prometheus.GaugeVec
37+
clusterInstaller *prometheus.GaugeVec
38+
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
3739
}
3840

3941
func newOperatorMetrics(optr *Operator) *operatorMetrics {
@@ -83,6 +85,10 @@ version for 'cluster', or empty for 'initial'.
8385
Name: "cluster_installer",
8486
Help: "Reports info about the installation process and, if applicable, the install tool.",
8587
}, []string{"type", "version", "invoker"}),
88+
clusterVersionOperatorUpdateRetrievalTimestampSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{
89+
Name: "cluster_version_operator_update_retrieval_timestamp_seconds",
90+
Help: "Reports when updates were last succesfully retrieved.",
91+
}, []string{"name"}),
8692
}
8793
}
8894

@@ -133,6 +139,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
133139
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
134140
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
135141
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
142+
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
136143
}
137144

138145
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
@@ -297,6 +304,15 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
297304
g.Set(1.0)
298305
ch <- g
299306
}
307+
308+
// check ability to retrieve recommended updates
309+
if availableUpdates := m.optr.getAvailableUpdates(); availableUpdates != nil {
310+
g := m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("")
311+
g.Set(float64(availableUpdates.LastSyncOrConfigChange.Unix()))
312+
ch <- g
313+
} else {
314+
klog.Warningf("availableUpdates is nil")
315+
}
300316
}
301317

302318
func gaugeFromInstallConfigMap(cm *corev1.ConfigMap, gauge *prometheus.GaugeVec, installType string) prometheus.Gauge {

0 commit comments

Comments
 (0)