Skip to content

Commit 97c89dd

Browse files
avlitman and Aviv Litman authored
Add recording rules with new naming (#4026)
Add recording rules with new names that follow the names linter and best practices, and deprecate the old names. Signed-off-by: alitman <alitman@alitman-thinkpadp1gen7.raanaii.csb> Co-authored-by: Aviv Litman <alitman@alitman-thinkpadp1gen7.raanaii.csb>
1 parent eb55111 commit 97c89dd

File tree

3 files changed

+54
-36
lines changed

3 files changed

+54
-36
lines changed

docs/metrics.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
| kubevirt_hco_single_stack_ipv6 | Metric | Gauge | Indicates whether the underlying cluster is single stack IPv6 (1) or not (0) |
1212
| kubevirt_hco_system_health_status | Metric | Gauge | Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources |
1313
| kubevirt_hco_unsafe_modifications | Metric | Gauge | Count of unsafe modifications in the HyperConverged annotations |
14-
| cluster:vmi_request_cpu_cores:sum | Recording rule | Gauge | Sum of CPU core requests for all running virt-launcher VMIs across the entire Kubevirt cluster |
14+
| cluster:kubevirt_hco_operator_health_status:count | Recording rule | Gauge | Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric |
15+
| cluster:kubevirt_hco_vmi_request_cpu_cores:sum | Recording rule | Gauge | Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster |
16+
| cluster:vmi_request_cpu_cores:sum | Recording rule | Gauge | [Deprecated] Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster |
1517
| cnv_abnormal | Recording rule | Gauge | Monitors resources for potential problems |
16-
| kubevirt_hyperconverged_operator_health_status | Recording rule | Gauge | Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric |
18+
| kubevirt_hyperconverged_operator_health_status | Recording rule | Gauge | [Deprecated] Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric |
1719

1820
## Developing new metrics
1921

hack/prom-rule-ci/hyperconverged-prom-rules-tests.yaml

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -581,31 +581,31 @@ tests:
581581
# time: 0 1 2 3 4 5
582582
values: "stale stale stale 1 0 1"
583583
promql_expr_test:
584-
- expr: 'cluster:vmi_request_cpu_cores:sum'
584+
- expr: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum'
585585
eval_time: 1m
586586
exp_samples:
587-
- labels: 'cluster:vmi_request_cpu_cores:sum{}'
587+
- labels: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum{}'
588588
value: 2
589589
# update for new pods
590-
- expr: 'cluster:vmi_request_cpu_cores:sum'
590+
- expr: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum'
591591
eval_time: 3m
592592
exp_samples:
593-
- labels: 'cluster:vmi_request_cpu_cores:sum{}'
593+
- labels: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum{}'
594594
value: 3
595595
# virt-launcher-new is not running at 4m. must exclude it
596-
- expr: 'cluster:vmi_request_cpu_cores:sum'
596+
- expr: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum'
597597
eval_time: 4m
598598
exp_samples:
599-
- labels: 'cluster:vmi_request_cpu_cores:sum{}'
599+
- labels: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum{}'
600600
value: 2
601601
# virt-launcher-new is back at 5m. must include it
602-
- expr: 'cluster:vmi_request_cpu_cores:sum'
602+
- expr: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum'
603603
eval_time: 5m
604604
exp_samples:
605-
- labels: 'cluster:vmi_request_cpu_cores:sum{}'
605+
- labels: 'cluster:kubevirt_hco_vmi_request_cpu_cores:sum{}'
606606
value: 3
607607

608-
# Test kubevirt_hyperconverged_operator_health_status recording rule
608+
# Test cluster:kubevirt_hco_operator_health_status:count recording rule
609609
- interval: 1m
610610
input_series:
611611
- series: 'kubevirt_hco_system_health_status'
@@ -619,76 +619,76 @@ tests:
619619
values: "1 stale 1 stale 1 stale 1 stale 1 stale 1 stale"
620620
promql_expr_test:
621621
# kubevirt_hco_system_health_status = 0 and both warning and critical alerts are firing at 0m
622-
- expr: 'kubevirt_hyperconverged_operator_health_status'
622+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
623623
eval_time: 0m
624624
exp_samples:
625-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
625+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
626626
value: 2
627627
# kubevirt_hco_system_health_status = 0 and only a warning alert is firing at 1m
628-
- expr: 'kubevirt_hyperconverged_operator_health_status'
628+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
629629
eval_time: 1m
630630
exp_samples:
631-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
631+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
632632
value: 1
633633
# kubevirt_hco_system_health_status = 0 and a critical alert is firing at 2m
634-
- expr: 'kubevirt_hyperconverged_operator_health_status'
634+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
635635
eval_time: 2m
636636
exp_samples:
637-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
637+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
638638
value: 2
639639
# kubevirt_hco_system_health_status = 0 and no alerts are firing at 3m
640-
- expr: 'kubevirt_hyperconverged_operator_health_status'
640+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
641641
eval_time: 3m
642642
exp_samples:
643-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
643+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
644644
value: 0
645645
# kubevirt_hco_system_health_status = 1 and both warning and critical alerts are firing at 4m
646-
- expr: 'kubevirt_hyperconverged_operator_health_status'
646+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
647647
eval_time: 4m
648648
exp_samples:
649-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
649+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
650650
value: 2
651651
# kubevirt_hco_system_health_status = 1 and only a warning alert is firing at 5m
652-
- expr: 'kubevirt_hyperconverged_operator_health_status'
652+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
653653
eval_time: 5m
654654
exp_samples:
655-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
655+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
656656
value: 1
657657
# kubevirt_hco_system_health_status = 1 and a critical alert is firing at 6m
658-
- expr: 'kubevirt_hyperconverged_operator_health_status'
658+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
659659
eval_time: 6m
660660
exp_samples:
661-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
661+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
662662
value: 2
663663
# kubevirt_hco_system_health_status = 1 and no alerts are firing at 7m
664-
- expr: 'kubevirt_hyperconverged_operator_health_status'
664+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
665665
eval_time: 7m
666666
exp_samples:
667-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
667+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
668668
value: 1
669669
# kubevirt_hco_system_health_status = 2 and both warning and critical alerts are firing at 8m
670-
- expr: 'kubevirt_hyperconverged_operator_health_status'
670+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
671671
eval_time: 8m
672672
exp_samples:
673-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
673+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
674674
value: 2
675675
# kubevirt_hco_system_health_status = 2 and only a warning alert is firing at 9m
676-
- expr: 'kubevirt_hyperconverged_operator_health_status'
676+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
677677
eval_time: 9m
678678
exp_samples:
679-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
679+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
680680
value: 2
681681
# kubevirt_hco_system_health_status = 2 and a critical alert is firing at 10m
682-
- expr: 'kubevirt_hyperconverged_operator_health_status'
682+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
683683
eval_time: 10m
684684
exp_samples:
685-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
685+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
686686
value: 2
687687
# kubevirt_hco_system_health_status = 2 and no alerts are firing at 11m
688-
- expr: 'kubevirt_hyperconverged_operator_health_status'
688+
- expr: 'cluster:kubevirt_hco_operator_health_status:count'
689689
eval_time: 11m
690690
exp_samples:
691-
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
691+
- labels: 'cluster:kubevirt_hco_operator_health_status:count{name="kubevirt-hyperconverged"}'
692692
value: 2
693693

694694
# Test kubevirt_hco_misconfigured_descheduler

pkg/monitoring/hyperconverged/rules/recordingrules/operator.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ var operatorRecordingRules = []operatorrules.RecordingRule{
2020
{
2121
MetricsOpts: operatormetrics.MetricOpts{
2222
Name: "kubevirt_hyperconverged_operator_health_status",
23+
Help: "[Deprecated] Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric",
24+
},
25+
MetricType: operatormetrics.GaugeType,
26+
Expr: buildOperatorHealthStatusExpr(),
27+
},
28+
{
29+
MetricsOpts: operatormetrics.MetricOpts{
30+
Name: "cluster:kubevirt_hco_operator_health_status:count",
2331
Help: "Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric",
2432
},
2533
MetricType: operatormetrics.GaugeType,
@@ -28,7 +36,15 @@ var operatorRecordingRules = []operatorrules.RecordingRule{
2836
{
2937
MetricsOpts: operatormetrics.MetricOpts{
3038
Name: "cluster:vmi_request_cpu_cores:sum",
31-
Help: "Sum of CPU core requests for all running virt-launcher VMIs across the entire Kubevirt cluster",
39+
Help: "[Deprecated] Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster",
40+
},
41+
MetricType: operatormetrics.GaugeType,
42+
Expr: intstr.FromString(`sum(kube_pod_container_resource_requests{resource="cpu"} and on (pod) kube_pod_status_phase{phase="Running"} * on (pod) group_left kube_pod_labels{ label_kubevirt_io="virt-launcher"} > 0)`),
43+
},
44+
{
45+
MetricsOpts: operatormetrics.MetricOpts{
46+
Name: "cluster:kubevirt_hco_vmi_request_cpu_cores:sum",
47+
Help: "Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster",
3248
},
3349
MetricType: operatormetrics.GaugeType,
3450
Expr: intstr.FromString(`sum(kube_pod_container_resource_requests{resource="cpu"} and on (pod) kube_pod_status_phase{phase="Running"} * on (pod) group_left kube_pod_labels{ label_kubevirt_io="virt-launcher"} > 0)`),

0 commit comments

Comments
 (0)