Skip to content

Commit 2ddb4a2

Browse files
Merge pull request #357 from jottofar/ota-151-extend-alerts
Extend CVO alerts to cover update retrieval
2 parents cca8239 + 1005398 commit 2ddb4a2

File tree

5 files changed

+36
-10
lines changed

5 files changed

+36
-10
lines changed

docs/user/status.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Under some conditions during installs and updates, the CVO will treat this condi
2929
When `RetrievedUpdates` is `True`, the CVO is successfully retrieving updates, which is good.
3030
When `RetrievedUpdates` is `False`, `reason` will be set to explain why, as discussed in the following subsections.
3131
In all cases, the impact is that the cluster will not be able to retrieve recommended updates, so cluster admins will need to monitor for available updates on their own or risk falling behind on security or other bugfixes.
32+
When the CVO is unable to retrieve recommended updates, the `CannotRetrieveUpdates` alert will fire and its message will contain the reason. This alert will not fire when the reason updates cannot be retrieved is `NoChannel`.
3233

3334
### NoUpstream
3435

install/0000_90_cluster-version-operator_02_servicemonitor.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,26 @@ spec:
4444
for: 10m
4545
labels:
4646
severity: critical
47+
- alert: CannotRetrieveUpdates
48+
annotations:
49+
message: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
50+
expr: |
51+
(time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"}
52+
labels:
53+
severity: critical
4754
- name: cluster-operators
4855
rules:
4956
- alert: ClusterOperatorDown
5057
annotations:
51-
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 mins. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
58+
message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 minutes. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
5259
expr: |
5360
cluster_operator_up{job="cluster-version-operator"} == 0
5461
for: 10m
5562
labels:
5663
severity: critical
5764
- alert: ClusterOperatorDegraded
5865
annotations:
59-
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 mins. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
66+
message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
6067
expr: |
6168
cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1
6269
for: 10m

pkg/cvo/availableupdates.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"github.com/openshift/cluster-version-operator/pkg/cincinnati"
2121
)
2222

23+
// noChannel is the Reason set on the RetrievedUpdates condition when no
// update channel has been configured.
const noChannel string = "NoChannel"
24+
2325
// syncAvailableUpdates attempts to retrieve the latest updates and update the status of the ClusterVersion
2426
// object. It will set the RetrievedUpdates condition. Updates are only checked if it has been more than
2527
// the minimumUpdateCheckInterval since the last check.
@@ -179,7 +181,7 @@ func calculateAvailableUpdatesStatus(clusterID string, proxyURL *url.URL, tlsCon
179181

180182
if len(channel) == 0 {
181183
return nil, configv1.ClusterOperatorStatusCondition{
182-
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: "NoChannel",
184+
Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse, Reason: noChannel,
183185
Message: "The update channel has not been configured.",
184186
}
185187
}

pkg/cvo/cvo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2375,7 +2375,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
23752375
Condition: configv1.ClusterOperatorStatusCondition{
23762376
Type: configv1.RetrievedUpdates,
23772377
Status: configv1.ConditionFalse,
2378-
Reason: "NoChannel",
2378+
Reason: noChannel,
23792379
Message: "The update channel has not been configured.",
23802380
},
23812381
},

pkg/cvo/metrics.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"k8s.io/apimachinery/pkg/labels"
1010
"k8s.io/apimachinery/pkg/util/sets"
1111
"k8s.io/client-go/tools/cache"
12+
"k8s.io/klog"
1213

1314
configv1 "github.com/openshift/api/config/v1"
1415
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
@@ -28,12 +29,13 @@ type operatorMetrics struct {
2829

2930
conditionTransitions map[conditionKey]int
3031

31-
version *prometheus.GaugeVec
32-
availableUpdates *prometheus.GaugeVec
33-
clusterOperatorUp *prometheus.GaugeVec
34-
clusterOperatorConditions *prometheus.GaugeVec
35-
clusterOperatorConditionTransitions *prometheus.GaugeVec
36-
clusterInstaller *prometheus.GaugeVec
32+
version *prometheus.GaugeVec
33+
availableUpdates *prometheus.GaugeVec
34+
clusterOperatorUp *prometheus.GaugeVec
35+
clusterOperatorConditions *prometheus.GaugeVec
36+
clusterOperatorConditionTransitions *prometheus.GaugeVec
37+
clusterInstaller *prometheus.GaugeVec
38+
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
3739
}
3840

3941
func newOperatorMetrics(optr *Operator) *operatorMetrics {
@@ -83,6 +85,10 @@ version for 'cluster', or empty for 'initial'.
8385
Name: "cluster_installer",
8486
Help: "Reports info about the installation process and, if applicable, the install tool.",
8587
}, []string{"type", "version", "invoker"}),
88+
clusterVersionOperatorUpdateRetrievalTimestampSeconds: prometheus.NewGaugeVec(prometheus.GaugeOpts{
89+
Name: "cluster_version_operator_update_retrieval_timestamp_seconds",
90+
Help: "Reports when updates were last succesfully retrieved.",
91+
}, []string{"name"}),
8692
}
8793
}
8894

@@ -133,6 +139,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
133139
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
134140
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
135141
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
142+
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
136143
}
137144

138145
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
@@ -297,6 +304,15 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
297304
g.Set(1.0)
298305
ch <- g
299306
}
307+
308+
// check ability to retrieve recommended updates
309+
if availableUpdates := m.optr.getAvailableUpdates(); availableUpdates != nil {
310+
g := m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("")
311+
g.Set(float64(availableUpdates.LastSyncOrConfigChange.Unix()))
312+
ch <- g
313+
} else {
314+
klog.Warningf("availableUpdates is nil")
315+
}
300316
}
301317

302318
func gaugeFromInstallConfigMap(cm *corev1.ConfigMap, gauge *prometheus.GaugeVec, installType string) prometheus.Gauge {

0 commit comments

Comments
 (0)