2020from lvmopstools .utils import Trigger
2121
2222from gort .overwatcher .core import OverwatcherModule , OverwatcherModuleTask
23- from gort .tools import decap , get_lvmapi_route
23+ from gort .tools import decap , ensure_period , get_lvmapi_route
2424
2525
2626if TYPE_CHECKING :
@@ -105,52 +105,37 @@ class AlertsMonitorTask(OverwatcherModuleTask["AlertsOverwatcher"]):
105105 restart_on_error = True
106106
107107 INTERVAL : float = 30
108- N_FAILURES : int = 5
109108
110109 async def task (self ):
111110 """Updates the alerts data."""
112111
113- n_failures : int = 0
114- last_error : str = "Undefined error"
115-
116- slack_channels = self .config ["overwatcher.slack.notifications_channels" ]
112+ SLACK_CHANNELS = self .config ["overwatcher.slack.notifications_channels" ]
113+ MAX_TIME = self .config ["overwatcher.alerts.max_time_without_alerts_data" ]
117114
118115 while True :
119116 try :
120117 await self .update_alerts ()
121118
122119 except Exception as err :
123120 self .log .error (f"Failed retriving alerts data: { decap (err )} " )
124- n_failures += 1
125- last_error = str (err )
126121
127- else :
128- self .module .last_updated = time ()
122+ if (
123+ self .module .alerts_data_unavailable is False
124+ and time () - self .module .last_updated > MAX_TIME
125+ ):
126+ await self .module .notify_unavailable (err )
129127
128+ else :
130129 # Send a resolution message if needed.
131130 if self .module .alerts_data_unavailable :
132131 await self .module .notify (
133132 "@here [RESOLVED]: Alerts data is now available." ,
134133 level = "info" ,
135- slack_channels = [* slack_channels , "lvm-alerts" ],
134+ slack_channels = [* SLACK_CHANNELS , "lvm-alerts" ],
136135 )
137136
138137 self .module .alerts_data_unavailable = False
139-
140- n_failures = 0
141- last_error = "Undefined error"
142-
143- finally :
144- if (
145- self .module .alerts_data_unavailable is False
146- and n_failures >= self .N_FAILURES
147- ):
148- await self .module .notify (
149- "Failed to retrieve alerts data multiple times: "
150- f"{ decap (last_error , add_period = True )} " ,
151- level = "critical" ,
152- )
153- self .module .alerts_data_unavailable = True
138+ self .module .last_updated = time ()
154139
155140 await asyncio .sleep (self .INTERVAL )
156141
@@ -181,9 +166,14 @@ def __init__(self, *args, **kwargs):
181166 self .state : AlertsSummary | None = None
182167 self .connectivity = ConnectivityStatus ()
183168
169+ # Overwatcher idle since timestamp.
170+ self .overwatcher_idle_since : float = 0.0
171+
172+ # Last valid updated. Must include a valid response
173+ # from the API and weather alerts data.
184174 self .last_updated : float = 0.0
185- self .idle_since : float = 0.0
186175
176+ # Alerts data has been unavailable for a while.
187177 self .alerts_data_unavailable : bool = False
188178
189179 async def is_safe (self ) -> tuple [bool , ActiveAlert ]:
@@ -197,21 +187,23 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
197187 active_alerts = ActiveAlert (0 )
198188
199189 # Keep track of how long the overwatcher has been idle.
200- if self .overwatcher .state .idle and self .idle_since == 0 :
201- self .idle_since = time ()
202- else :
203- self .idle_since = 0
190+ if self .overwatcher .state .idle and self .overwaidle_since == 0 :
191+ self .overwaidle_since = time ()
192+ elif not self . overwatcher . state . idle :
193+ self .overwaidle_since = 0
204194
205195 if self .alerts_data_unavailable :
206196 self .log .warning ("Alerts data is unavailable." )
207197 active_alerts |= ActiveAlert .ALERTS_DATA_UNAVAILABLE
208198 return False , active_alerts
209199
210- if self .alerts_data_unavailable is False and time () - self .last_updated > 300 :
200+ if self .alerts_data_unavailable is False and time () - self .last_updated > 600 :
211201 # If the data is not unavailable but it has not been updated
212- # in the last 5 minutes, something is wrong. We mark it as unavailable.
213- self .log .warning ("Alerts data has not been updated in the last 5 minutes." )
214- self .alerts_data_unavailable = True
202+ # in the last 10 minutes, something is wrong. We mark it as unavailable.
203+ # This is a redundancy since the AlertsMonitorTask should have already
204+ # marked it as unavailable before that.
205+ self .log .warning ("Alerts data has not been updated in the last 10 minutes." )
206+ await self .notify_unavailable ()
215207 active_alerts |= ActiveAlert .ALERTS_DATA_UNAVAILABLE
216208 return False , active_alerts
217209
@@ -273,7 +265,7 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
273265 # If it's safe to observe but we have been idle for a while, we
274266 # raise an alert but do not change the is_safe status.
275267 timeout = self .overwatcher .config ["overwatcher.alerts.idle_timeout" ] or 600
276- if self .idle_since > 0 and (time () - self .idle_since ) > timeout :
268+ if self .overwaidle_since > 0 and (time () - self .overwaidle_since ) > timeout :
277269 await self .notify (
278270 f"Overwatcher has been idle for over { timeout :.0f} s." ,
279271 min_time_between_repeat_notifications = 300 ,
@@ -288,7 +280,18 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
288280
289281 return is_safe , active_alerts
290282
291- async def update_status (self ) -> AlertsSummary :
283+ async def notify_unavailable (self , err : Exception | str | None = None ):
284+ """Sends a notification that alerts data is unavailable."""
285+
286+ msg = "Failed to retrieve alerts data multiple times"
287+ if err is not None and str (err ) != "" :
288+ msg += f": { decap (str (err ))} "
289+
290+ await self .notify (ensure_period (msg ), level = "critical" )
291+
292+ self .alerts_data_unavailable = True
293+
294+ async def update_status (self ) -> AlertsSummary | None :
292295 """Returns the alerts report."""
293296
294297 try :
@@ -322,4 +325,4 @@ async def update_status(self) -> AlertsSummary:
322325 else :
323326 self .connectivity .internet .reset ()
324327
325- return summary
328+ return self . state
0 commit comments