Skip to content

Commit bd6300d

Browse files
committed
Improve alerts logic
1 parent de60592 commit bd6300d

File tree

3 files changed

+43
-39
lines changed

3 files changed

+43
-39
lines changed

src/gort/etc/lvmgort.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ overwatcher:
321321
stop_secs_before_morning: 600
322322

323323
alerts:
324+
max_time_without_alerts_data: 300
324325
idle_timeout: 600
325326
main_task_stalled: 600
326327

src/gort/overwatcher/alerts.py

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from lvmopstools.utils import Trigger
2121

2222
from gort.overwatcher.core import OverwatcherModule, OverwatcherModuleTask
23-
from gort.tools import decap, get_lvmapi_route
23+
from gort.tools import decap, ensure_period, get_lvmapi_route
2424

2525

2626
if TYPE_CHECKING:
@@ -105,52 +105,37 @@ class AlertsMonitorTask(OverwatcherModuleTask["AlertsOverwatcher"]):
105105
restart_on_error = True
106106

107107
INTERVAL: float = 30
108-
N_FAILURES: int = 5
109108

110109
async def task(self):
111110
"""Updates the alerts data."""
112111

113-
n_failures: int = 0
114-
last_error: str = "Undefined error"
115-
116-
slack_channels = self.config["overwatcher.slack.notifications_channels"]
112+
SLACK_CHANNELS = self.config["overwatcher.slack.notifications_channels"]
113+
MAX_TIME = self.config["overwatcher.alerts.max_time_without_alerts_data"]
117114

118115
while True:
119116
try:
120117
await self.update_alerts()
121118

122119
except Exception as err:
123120
self.log.error(f"Failed retriving alerts data: {decap(err)}")
124-
n_failures += 1
125-
last_error = str(err)
126121

127-
else:
128-
self.module.last_updated = time()
122+
if (
123+
self.module.alerts_data_unavailable is False
124+
and time() - self.module.last_updated > MAX_TIME
125+
):
126+
await self.module.notify_unavailable(err)
129127

128+
else:
130129
# Send a resolution message if needed.
131130
if self.module.alerts_data_unavailable:
132131
await self.module.notify(
133132
"@here [RESOLVED]: Alerts data is now available.",
134133
level="info",
135-
slack_channels=[*slack_channels, "lvm-alerts"],
134+
slack_channels=[*SLACK_CHANNELS, "lvm-alerts"],
136135
)
137136

138137
self.module.alerts_data_unavailable = False
139-
140-
n_failures = 0
141-
last_error = "Undefined error"
142-
143-
finally:
144-
if (
145-
self.module.alerts_data_unavailable is False
146-
and n_failures >= self.N_FAILURES
147-
):
148-
await self.module.notify(
149-
"Failed to retrieve alerts data multiple times: "
150-
f"{decap(last_error, add_period=True)}",
151-
level="critical",
152-
)
153-
self.module.alerts_data_unavailable = True
138+
self.module.last_updated = time()
154139

155140
await asyncio.sleep(self.INTERVAL)
156141

@@ -181,9 +166,14 @@ def __init__(self, *args, **kwargs):
181166
self.state: AlertsSummary | None = None
182167
self.connectivity = ConnectivityStatus()
183168

169+
# Overwatcher idle since timestamp.
170+
self.overwatcher_idle_since: float = 0.0
171+
172+
# Last valid update. Must include a valid response
173+
# from the API and weather alerts data.
184174
self.last_updated: float = 0.0
185-
self.idle_since: float = 0.0
186175

176+
# Alerts data has been unavailable for a while.
187177
self.alerts_data_unavailable: bool = False
188178

189179
async def is_safe(self) -> tuple[bool, ActiveAlert]:
@@ -197,21 +187,23 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
197187
active_alerts = ActiveAlert(0)
198188

199189
# Keep track of how long the overwatcher has been idle.
200-
if self.overwatcher.state.idle and self.idle_since == 0:
201-
self.idle_since = time()
202-
else:
203-
self.idle_since = 0
190+
if self.overwatcher.state.idle and self.overwaidle_since == 0:
191+
self.overwaidle_since = time()
192+
elif not self.overwatcher.state.idle:
193+
self.overwaidle_since = 0
204194

205195
if self.alerts_data_unavailable:
206196
self.log.warning("Alerts data is unavailable.")
207197
active_alerts |= ActiveAlert.ALERTS_DATA_UNAVAILABLE
208198
return False, active_alerts
209199

210-
if self.alerts_data_unavailable is False and time() - self.last_updated > 300:
200+
if self.alerts_data_unavailable is False and time() - self.last_updated > 600:
211201
# If the data is not unavailable but it has not been updated
212-
# in the last 5 minutes, something is wrong. We mark it as unavailable.
213-
self.log.warning("Alerts data has not been updated in the last 5 minutes.")
214-
self.alerts_data_unavailable = True
202+
# in the last 10 minutes, something is wrong. We mark it as unavailable.
203+
# This is a redundancy since the AlertsMonitorTask should have already
204+
# marked it as unavailable before that.
205+
self.log.warning("Alerts data has not been updated in the last 10 minutes.")
206+
await self.notify_unavailable()
215207
active_alerts |= ActiveAlert.ALERTS_DATA_UNAVAILABLE
216208
return False, active_alerts
217209

@@ -273,7 +265,7 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
273265
# If it's safe to observe but we have been idle for a while, we
274266
# raise an alert but do not change the is_safe status.
275267
timeout = self.overwatcher.config["overwatcher.alerts.idle_timeout"] or 600
276-
if self.idle_since > 0 and (time() - self.idle_since) > timeout:
268+
if self.overwaidle_since > 0 and (time() - self.overwaidle_since) > timeout:
277269
await self.notify(
278270
f"Overwatcher has been idle for over {timeout:.0f} s.",
279271
min_time_between_repeat_notifications=300,
@@ -288,7 +280,18 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
288280

289281
return is_safe, active_alerts
290282

291-
async def update_status(self) -> AlertsSummary:
283+
async def notify_unavailable(self, err: Exception | str | None = None):
    """Sends a critical notification and flags alerts data as unavailable.

    Parameters
    ----------
    err
        The error, or error message, associated with the failure to
        retrieve the alerts data. When it is not ``None`` and its string
        form is not empty, it is appended to the notification message.

    """

    msg = "Failed to retrieve alerts data multiple times"
    if err is not None and str(err) != "":
        # NOTE(review): decap() presumably lower-cases the first character
        # so the error reads naturally after the colon; confirm in gort.tools.
        msg += f": {decap(str(err))}"

    try:
        # NOTE(review): ensure_period() presumably appends a trailing
        # period when one is missing; confirm in gort.tools.
        await self.notify(ensure_period(msg), level="critical")
    finally:
        # The flag is what is_safe() checks, so it must be set even if the
        # notification itself fails to be delivered (or is cancelled).
        self.alerts_data_unavailable = True
293+
294+
async def update_status(self) -> AlertsSummary | None:
292295
"""Returns the alerts report."""
293296

294297
try:
@@ -322,4 +325,4 @@ async def update_status(self) -> AlertsSummary:
322325
else:
323326
self.connectivity.internet.reset()
324327

325-
return summary
328+
return self.state

src/gort/overwatcher/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(self, log: LogNamespace | None = None):
3838

3939
self._log: Mock | LogNamespace = log or Mock()
4040

41-
async def run(self):
41+
async def run(self, *args, **kwargs):
4242
"""Runs the task."""
4343

4444
self._task_runner = asyncio.create_task(self.task())

0 commit comments

Comments
 (0)