Skip to content

Commit a04cd5f

Browse files
authored
KFLUXINFRA-1986: Update Kueue alerting rules (#7381)
- Include the rules in the platform Prometheus. When they are included in the UWM Prometheus, a label with the namespace name (tekton-kueue) is added to the rules, which breaks them.
- Don't use critical severity, so the alerts won't show up for SRE platform (they are not interested in those alerts and shouldn't do anything with them).
- Add an alert for the success rate of requests to the tekton-kueue mutating webhook servers.

Signed-off-by: Gal Ben Haim <[email protected]>
1 parent 39e9bee commit a04cd5f

File tree

4 files changed

+64
-6
lines changed

4 files changed

+64
-6
lines changed

components/kueue/development/tekton-kueue-monitoring/kueue-prometheus-alerts.yaml

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ spec:
3232
) * 100 < 100
3333
for: 5m
3434
labels:
35-
severity: critical
35+
severity: warning
3636
component: kueue
3737
annotations:
3838
summary: "Kueue deployment {{ $labels.deployment }} has unavailable replicas"
@@ -115,7 +115,7 @@ spec:
115115
expr: max by (status) (kueue_cluster_queue_status{cluster_queue="cluster-pipeline-queue", status="active"}) != 1
116116
for: 2m
117117
labels:
118-
severity: critical
118+
severity: warning
119119
component: kueue
120120
annotations:
121121
summary: "Kueue cluster queue is not active"
@@ -196,7 +196,7 @@ spec:
196196
expr: up{job=~".*kueue.*"} == 0
197197
for: 5m
198198
labels:
199-
severity: critical
199+
severity: warning
200200
component: kueue
201201
annotations:
202202
summary: "Kueue metrics endpoint is down"
@@ -217,3 +217,24 @@ spec:
217217
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue.md?ref_type=heads
218218
alert_team_handle: <!subteam^S05Q1P4Q2TG>
219219
team: konflux-infra
220+
221+
- alert: KueueMutatingWebhookLowSuccessRate
222+
expr: |
223+
100 * sum(increase(apiserver_admission_webhook_request_total{
224+
name="pipelinerun-kueue-defaulter.tekton-kueue.io", code=~"2.."
225+
}[10m]))
226+
/
227+
sum(increase(apiserver_admission_webhook_request_total{
228+
name="pipelinerun-kueue-defaulter.tekton-kueue.io"
229+
}[10m]))
230+
< 99
231+
for: 10m
232+
labels:
233+
severity: warning
234+
component: kueue
235+
annotations:
236+
summary: "Kueue mutating webhook success rate is below 99%"
237+
description: "The mutating webhook 'pipelinerun-kueue-defaulter.tekton-kueue.io' has had a success rate below 99% over the past 10 minutes. Possible causes include webhook errors, rejections, or unreachability (e.g., code=600)."
238+
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue.md?ref_type=heads
239+
alert_team_handle: <!subteam^S05Q1P4Q2TG>
240+
team: konflux-infra

components/kueue/development/tekton-kueue/kustomization.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,12 @@ configMapGenerator:
1919
behavior: replace
2020
files:
2121
- config.yaml
22+
23+
patches:
24+
- target:
25+
kind: Namespace
26+
name: tekton-kueue
27+
patch: |-
28+
- op: add
29+
path: /metadata/labels/openshift.io~1cluster-monitoring
30+
value: "true"

components/kueue/staging/base/tekton-kueue-monitoring/kueue-prometheus-alerts.yaml

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ spec:
3232
) * 100 < 100
3333
for: 5m
3434
labels:
35-
severity: critical
35+
severity: warning
3636
component: kueue
3737
annotations:
3838
summary: "Kueue deployment {{ $labels.deployment }} has unavailable replicas"
@@ -115,7 +115,7 @@ spec:
115115
expr: max by (status) (kueue_cluster_queue_status{cluster_queue="cluster-pipeline-queue", status="active"}) != 1
116116
for: 2m
117117
labels:
118-
severity: critical
118+
severity: warning
119119
component: kueue
120120
annotations:
121121
summary: "Kueue cluster queue is not active"
@@ -196,7 +196,7 @@ spec:
196196
expr: up{job=~".*kueue.*"} == 0
197197
for: 5m
198198
labels:
199-
severity: critical
199+
severity: warning
200200
component: kueue
201201
annotations:
202202
summary: "Kueue metrics endpoint is down"
@@ -217,3 +217,24 @@ spec:
217217
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue.md?ref_type=heads
218218
alert_team_handle: <!subteam^S05Q1P4Q2TG>
219219
team: konflux-infra
220+
221+
- alert: KueueMutatingWebhookLowSuccessRate
222+
expr: |
223+
100 * sum(increase(apiserver_admission_webhook_request_total{
224+
name="pipelinerun-kueue-defaulter.tekton-kueue.io", code=~"2.."
225+
}[10m]))
226+
/
227+
sum(increase(apiserver_admission_webhook_request_total{
228+
name="pipelinerun-kueue-defaulter.tekton-kueue.io"
229+
}[10m]))
230+
< 99
231+
for: 10m
232+
labels:
233+
severity: warning
234+
component: kueue
235+
annotations:
236+
summary: "Kueue mutating webhook success rate is below 99%"
237+
description: "The mutating webhook 'pipelinerun-kueue-defaulter.tekton-kueue.io' has had a success rate below 99% over the past 10 minutes. Possible causes include webhook errors, rejections, or unreachability (e.g., code=600)."
238+
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue.md?ref_type=heads
239+
alert_team_handle: <!subteam^S05Q1P4Q2TG>
240+
team: konflux-infra

components/kueue/staging/base/tekton-kueue/kustomization.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,10 @@ patches:
3333
kind: Deployment
3434
name: tekton-kueue-controller-manager
3535
version: v1
36+
- target:
37+
kind: Namespace
38+
name: tekton-kueue
39+
patch: |-
40+
- op: add
41+
path: /metadata/labels/openshift.io~1cluster-monitoring
42+
value: "true"

0 commit comments

Comments
 (0)