Skip to content

Commit 6ca1982

Browse files
committed
implement OK alerts
1 parent 6275ec4 commit 6ca1982

File tree

1 file changed

+109
-8
lines changed

1 file changed

+109
-8
lines changed

modules/alert_bot.py

Lines changed: 109 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ def init_alerts():
3434
"Validator's wallet <code>{wallet}</code> balance is less than 10 TON: {balance} TON.",
3535
18 * HOUR
3636
),
37+
"low_wallet_balance_ok": Alert(
38+
"info",
39+
"Validator's wallet balance is back to normal",
40+
"Validator's wallet <code>{wallet}</code> balance is {balance} TON.",
41+
0
42+
),
3743
"db_usage_80": Alert(
3844
"high",
3945
"Node's db usage is more than 80%",
@@ -50,6 +56,12 @@ def init_alerts():
5056
or (and) set node\'s archive ttl to lower value.""",
5157
6 * HOUR
5258
),
59+
"db_usage_ok": Alert(
60+
"info",
61+
"Node's db usage is back to normal",
62+
"TON DB usage is back to normal: <b>{usage}%</b>.",
63+
0
64+
),
5365
"low_efficiency": Alert(
5466
"high",
5567
"Validator had efficiency less than 90% in the validation round",
@@ -62,24 +74,48 @@ def init_alerts():
6274
"Node is out of sync on more than 20 sec: <b>{sync} sec</b>.",
6375
300
6476
),
77+
"sync_ok": Alert(
78+
"info",
79+
"Node is back in sync",
80+
"Node is back in sync: <b>{sync} sec</b>.",
81+
0
82+
),
6583
"service_down": Alert(
6684
"critical",
67-
"Node is not running (service is down)",
68-
"validator.service is down.",
85+
"Node is not running",
86+
"Node is not running. Probably daemon service is down.",
6987
300
7088
),
89+
"service_down_ok": Alert(
90+
"info",
91+
"Node is recovered",
92+
"Node is running.",
93+
0
94+
),
7195
"adnl_connection_failed": Alert(
7296
"high",
7397
"Node is not answering to ADNL connection",
74-
"ADNL connection to node failed",
98+
"ADNL connection to node failed.",
7599
3 * HOUR
76100
),
101+
"adnl_connection_ok": Alert(
102+
"info",
103+
"ADNL connection restored",
104+
"ADNL connection to node is OK.",
105+
0
106+
),
77107
"zero_block_created": Alert(
78108
"critical",
79109
"Validator has not created any blocks in the last few hours",
80110
"Validator has not created any blocks in the last <b>{hours} hours</b>.",
81111
int(VALIDATION_PERIOD / 2.3)
82112
),
113+
"zero_block_created_ok": Alert(
114+
"info",
115+
"Validator resumed block production",
116+
"Validator resumed block production. Blocks produced in the recent window: <b>{blocks}</b> in ~<b>{hours}h</b>.",
117+
0
118+
),
83119
"validator_slashed": Alert(
84120
"high",
85121
"Validator has been slashed in the previous validation round",
@@ -116,6 +152,12 @@ def init_alerts():
116152
"Found proposals with hashes `{hashes}` that have significant amount of votes, but current validator didn't vote for them. Please check @tonstatus for more details.",
117153
VALIDATION_PERIOD
118154
),
155+
"voting_ok": Alert(
156+
"info",
157+
"All high-priority proposals are voted",
158+
"All high-priority proposals are voted or no longer require action.",
159+
0
160+
),
119161
"initial_sync_completed": Alert(
120162
"info",
121163
"Initial sync has been completed (info alert with no sound)",
@@ -128,6 +170,12 @@ def init_alerts():
128170
"All collators for shards <code>{shards}</code> are offline.",
129171
3600
130172
),
173+
"shard_collators_ok": Alert(
174+
"info",
175+
"Shards have online collators again",
176+
"All required shards have online collators again.",
177+
0
178+
),
131179
}
132180

133181

@@ -162,7 +210,7 @@ def send_message(self, text: str, silent: bool = False, disable_web_page_preview
162210
if not response['ok']:
163211
raise Exception(f"send_message error: {response}")
164212

165-
def send_alert(self, alert_name: str, *args, **kwargs):
213+
def send_alert(self, alert_name: str, *args, track_active: bool = True, **kwargs):
166214
if not self.alert_is_enabled(alert_name):
167215
return
168216
last_sent = self.get_alert_sent(alert_name)
@@ -175,8 +223,10 @@ def send_alert(self, alert_name: str, *args, **kwargs):
175223
for key, value in kwargs.items():
176224
if isinstance(value, (int, float)):
177225
kwargs[key] = f'{value:,}'.replace(',', ' ') # make space separator for thousands
178-
179-
text = '🆘' if alert.severity != 'info' else ''
226+
if alert_name.endswith('_ok'):
227+
text = '✅'
228+
else:
229+
text = '🆘' if alert.severity != 'info' else ''
180230
text += f''' <b>Node {self.hostname}: {alert_name_readable} </b>
181231
182232
{alert.text.format(*args, **kwargs)}
@@ -196,6 +246,30 @@ def send_alert(self, alert_name: str, *args, **kwargs):
196246
if time.time() - last_sent > alert.timeout:
197247
self.send_message(text, alert.severity == "info") # send info alerts without sound
198248
self.set_alert_sent(alert_name)
249+
if track_active:
250+
self._set_alert_active(alert_name, True)
251+
252+
def resolve_alert(self, alert_name: str, ok_alert_name: str = None, **kwargs):
    """Send the "back to normal" notification for a previously fired alert.

    Does nothing unless ``alert_name`` is currently marked active. Otherwise
    the matching OK alert is sent (when it is defined in ``ALERTS`` and
    enabled) and the active mark on ``alert_name`` is cleared.

    :param alert_name: name of the problem alert that may need resolving.
    :param ok_alert_name: name of the OK alert to send; falls back to
        ``"<alert_name>_ok"`` when empty or omitted.
    :param kwargs: values forwarded to ``send_alert`` for the OK alert.
    """
    if not self._is_alert_active(alert_name):
        return

    ok_name = ok_alert_name or f"{alert_name}_ok"
    if ok_name in ALERTS and self.alert_is_enabled(ok_name):
        # track_active=False: the OK alert itself must not be marked active.
        self.send_alert(ok_name, track_active=False, **kwargs)

    # Cleared only after the send attempt, so an exception raised while
    # sending leaves the alert active.
    self._set_alert_active(alert_name, False)
264+
265+
def resolve_alert_group(self, alert_names: list, ok_alert_name: str, **kwargs):
    """Resolve several related alerts with one shared OK notification.

    Does nothing when none of ``alert_names`` is marked active. Otherwise a
    single ``ok_alert_name`` alert is sent (when it is defined in ``ALERTS``
    and enabled) and every alert in the group that is still marked active
    has its mark cleared.

    :param alert_names: names of the problem alerts forming the group.
    :param ok_alert_name: name of the OK alert announcing recovery.
    :param kwargs: values forwarded to ``send_alert`` for the OK alert.
    """
    is_active = self._is_alert_active
    if not any(map(is_active, alert_names)):
        return

    can_notify = ok_alert_name in ALERTS and self.alert_is_enabled(ok_alert_name)
    if can_notify:
        # track_active=False: the OK alert itself must not be marked active.
        self.send_alert(ok_alert_name, track_active=False, **kwargs)

    # filter() is lazy, so each name is checked right before it is cleared.
    for active_name in filter(is_active, alert_names):
        self._set_alert_active(active_name, False)
199273

200274
def set_global_vars(self):
201275
# set global vars for correct alerts timeouts for current network
@@ -228,7 +302,7 @@ def get_alert_from_db(self, alert_name: str):
228302
if 'alerts' not in self.ton.local.db:
229303
self.ton.local.db['alerts'] = {}
230304
if alert_name not in self.ton.local.db['alerts']:
231-
self.ton.local.db['alerts'][alert_name] = {'sent': 0, 'enabled': True}
305+
self.ton.local.db['alerts'][alert_name] = {'sent': 0, 'enabled': True, 'active': False, 'resolved_sent': 0}
232306
return self.ton.local.db['alerts'][alert_name]
233307

234308
def set_alert_sent(self, alert_name: str):
@@ -248,6 +322,16 @@ def set_alert_enabled(self, alert_name: str, enabled: bool):
248322
alert['enabled'] = enabled
249323
self.ton.local.save()
250324

325+
def _set_alert_active(self, alert_name: str, active: bool):
    """Record whether ``alert_name`` is currently firing and persist the change.

    The stored record is touched only when the flag actually changes. On a
    change to inactive, ``resolved_sent`` is set to the current Unix time;
    this happens whether or not an OK message was actually delivered.

    :param alert_name: name of the alert whose state is updated.
    :param active: ``True`` when the alert has fired, ``False`` once resolved.
    """
    alert = self.get_alert_from_db(alert_name)
    if alert.get('active') == active:
        return  # no state change: skip the write
    alert['active'] = active
    if not active:
        alert['resolved_sent'] = int(time.time())
    # Persist like the other alert setters do; otherwise a restart could
    # lose the active flag and the matching OK alert would never be sent.
    self.ton.local.save()
331+
332+
def _is_alert_active(self, alert_name: str) -> bool:
    """Return the stored ``active`` flag for ``alert_name``.

    Records that have no ``active`` key are reported as not active.
    """
    record = self.get_alert_from_db(alert_name)
    return record.get('active', False)
334+
251335
def enable_alert(self, args):
252336
if len(args) != 1:
253337
raise Exception("Usage: enable_alert <alert_name>")
@@ -320,6 +404,8 @@ def check_db_usage(self):
320404
self.send_alert("db_usage_95")
321405
elif usage > 80:
322406
self.send_alert("db_usage_80")
407+
else:
408+
self.resolve_alert_group(["db_usage_95", "db_usage_80"], "db_usage_ok", usage=int(usage))
323409

324410
def check_validator_wallet_balance(self):
325411
if not self.ton.using_validator():
@@ -331,6 +417,8 @@ def check_validator_wallet_balance(self):
331417
validator_account = self.ton.GetAccount(validator_wallet.addrB64)
332418
if validator_account.status != "empty" and validator_account.balance < 10:
333419
self.send_alert("low_wallet_balance", wallet=validator_wallet.addrB64, balance=validator_account.balance)
420+
else:
421+
self.resolve_alert("low_wallet_balance", ok_alert_name="low_wallet_balance_ok", wallet=validator_wallet.addrB64, balance=validator_account.balance)
334422

335423
def check_efficiency(self):
336424
if not self.ton.using_validator():
@@ -351,11 +439,15 @@ def check_validator_working(self):
351439
validator_status = self.ton.GetValidatorStatus()
352440
if not self.initial_sync and not validator_status.is_working:
353441
self.send_alert("service_down")
442+
elif not self.initial_sync and validator_status.is_working:
443+
self.resolve_alert("service_down", ok_alert_name="service_down_ok")
354444

355445
def check_sync(self):
356446
validator_status = self.ton.GetValidatorStatus()
357447
if not self.initial_sync and validator_status.is_working and validator_status.out_of_sync >= 20:
358448
self.send_alert("out_of_sync", sync=validator_status.out_of_sync)
449+
elif not self.initial_sync and validator_status.is_working and validator_status.out_of_sync < 20:
450+
self.resolve_alert("out_of_sync", ok_alert_name="sync_ok", sync=validator_status.out_of_sync)
359451

360452
def check_zero_blocks_created(self):
361453
if not self.ton.using_validator():
@@ -368,7 +460,10 @@ def check_zero_blocks_created(self):
368460
return
369461
validators = self.ton.GetValidatorsList(start=start, end=end)
370462
validator = self.validator_module.find_myself(validators)
371-
if validator is None or validator.blocks_created > 0:
463+
if validator is None:
464+
return
465+
if validator.blocks_created > 0:
466+
self.resolve_alert("zero_block_created", ok_alert_name="zero_block_created_ok", blocks=validator.blocks_created, hours=round(period / 3600))
372467
return
373468
self.send_alert("zero_block_created", hours=round(period / 3600))
374469

@@ -386,6 +481,8 @@ def check_adnl_connection_failed(self):
386481
if not ok:
387482
self.local.add_log(error, "warning")
388483
self.send_alert("adnl_connection_failed")
484+
else:
485+
self.resolve_alert("adnl_connection_failed", ok_alert_name="adnl_connection_ok")
389486

390487
def get_myself_from_election(self, config: dict):
391488
if not config["validators"]:
@@ -450,6 +547,8 @@ def check_voting(self):
450547
need_to_vote.append(offer['hash'])
451548
if need_to_vote:
452549
self.send_alert("voting", hashes=' '.join(need_to_vote))
550+
else:
551+
self.resolve_alert("voting", ok_alert_name="voting_ok")
453552

454553
def check_initial_sync(self):
455554
if not self.initial_sync:
@@ -478,6 +577,8 @@ def check_online_collators(self):
478577

479578
if offline_shards:
480579
self.send_alert("shard_collators_offline", shards=' '.join(offline_shards))
580+
else:
581+
self.resolve_alert("shard_collators_offline", ok_alert_name="shard_collators_ok")
481582

482583
def check_status(self):
483584
if not self.ton.using_alert_bot():

0 commit comments

Comments
 (0)