Skip to content

Commit 990bb10

Browse files
authored
Merge branch 'main' into main
2 parents 780d08a + cb368a2 commit 990bb10

File tree

10 files changed

+2043
-7
lines changed

10 files changed

+2043
-7
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ __pycache__/*
1717
kube-burner*
1818
kube_burner*
1919
recommender_*.json
20+
resiliency*.json
2021

2122
# Project files
2223
.ropeproject

config/config.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ kraken:
5555
- kubevirt_vm_outage:
5656
- scenarios/kubevirt/kubevirt-vm-outage.yaml
5757

58+
resiliency:
59+
resiliency_run_mode: standalone # Options: standalone, controller, disabled
60+
resiliency_file: config/alerts.yaml # Path to SLO definitions; falls back to performance_monitoring.alert_profile if not specified
61+
5862
cerberus:
5963
cerberus_enabled: False # Enable it when cerberus is previously installed
6064
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
@@ -131,4 +135,5 @@ kubevirt_checks: # Utilizing virt che
131135
disconnected: False # Boolean of how to try to connect to the VMIs; if True will use the ip_address to try ssh from within a node, if false will use the name and uses virtctl to try to connect; Default is False
132136
ssh_node: "" # If set, will be a backup way to ssh to a node. Will want to set to a node that isn't targeted in chaos
133137
node_names: ""
134-
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False
138+
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False
139+

krkn/prometheus/collector.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
from __future__ import annotations
2+
3+
import datetime
4+
import logging
5+
from typing import Dict, Any, List, Optional
6+
7+
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
8+
9+
10+
# -----------------------------------------------------------------------------
11+
# SLO evaluation helpers (used by krkn.resiliency)
12+
# -----------------------------------------------------------------------------
13+
14+
15+
def slo_passed(prometheus_result: List[Any]) -> Optional[bool]:
    """Decide whether an SLO held, given the series returned by a query.

    Each series is inspected in one of two shapes:

    * a ``"values"`` entry: an iterable of ``(timestamp, value)`` pairs
      (presumably a Prometheus range/matrix result); any sample greater
      than zero is a violation, unparsable samples are skipped;
    * a ``"value"`` entry: a ``(timestamp, value)`` pair (presumably an
      instant/vector result); any sample that is not exactly zero, or that
      cannot be parsed as a float, is a violation.

    Series carrying neither key are ignored.

    Args:
        prometheus_result: List of series mappings returned by the query.

    Returns:
        ``False`` as soon as any series shows a violation, ``True`` when at
        least one series carried samples and none violated, and ``None`` when
        the result is empty or no series carried samples.
    """
    if not prometheus_result:
        return None

    has_samples = False
    for series in prometheus_result:
        if "values" in series:
            has_samples = True
            for _ts, val in series["values"]:
                try:
                    if float(val) > 0:
                        return False
                except (TypeError, ValueError):
                    # Unparsable range samples are skipped, not counted as failures.
                    continue
        elif "value" in series:
            has_samples = True
            try:
                # Only a violation ends the scan early; a zero sample must not
                # hide a violating sample in a later series.
                if float(series["value"][1]) != 0:
                    return False
            except (TypeError, ValueError):
                return False

    # No series carried samples at all: let the caller decide what that means.
    return True if has_samples else None
37+
38+
39+
def evaluate_slos(
    prom_cli: KrknPrometheus,
    slo_list: List[Dict[str, Any]],
    start_time: datetime.datetime,
    end_time: datetime.datetime,
) -> Dict[str, bool]:
    """Run every SLO expression as a range query and report pass/fail per SLO.

    Args:
        prom_cli: Client used to run the range queries.
        slo_list: SLO definitions; each dict must provide ``name`` and ``expr``.
        start_time: Start of the evaluation window.
        end_time: End of the evaluation window.

    Returns:
        Mapping of SLO name to ``True`` (SLO passed) or ``False`` (SLO failed).
        A query that yields no data counts as a pass; a query that raises
        counts as a failure.
    """
    logging.info("Evaluating %d SLOs over window %s – %s", len(slo_list), start_time, end_time)
    outcome: Dict[str, bool] = {}
    for entry in slo_list:
        query = entry["expr"]
        slo_name = entry["name"]
        try:
            verdict = slo_passed(
                prom_cli.process_prom_query_in_range(
                    query,
                    start_time=start_time,
                    end_time=end_time,
                )
            )
            if verdict is None:
                # No data means the condition never triggered, so count it as a pass.
                logging.debug("SLO '%s' query returned no data; assuming pass.", slo_name)
            outcome[slo_name] = True if verdict is None else verdict
        except Exception as exc:
            logging.error("PromQL query failed for SLO '%s': %s", slo_name, exc)
            outcome[slo_name] = False
    return outcome

krkn/resiliency/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""krkn.resiliency package public interface."""
2+
3+
from .resiliency import Resiliency # noqa: F401
4+
from .score import calculate_resiliency_score # noqa: F401

0 commit comments

Comments
 (0)