Skip to content

Commit 044fb56

Browse files
authored
Merge pull request #118 from MannLabs/add_instrument_stall
add instrument stall alert
2 parents 636e0bb + 2eb757b commit 044fb56

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

monitoring/alert_manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
DiskSpaceAlert,
1111
HealthCheckFailedAlert,
1212
InstrumentFilePileUpAlert,
13+
InstrumentStallAlert,
1314
PumpPressureAlert,
1415
RawFileErrorAlert,
1516
S3UploadFailureAlert,
@@ -45,6 +46,7 @@ def __init__(self):
4546
HealthCheckFailedAlert(),
4647
StatusPileUpAlert(),
4748
InstrumentFilePileUpAlert(),
49+
InstrumentStallAlert(),
4850
RawFileErrorAlert(),
4951
S3UploadFailureAlert(),
5052
WebAppHealthAlert(),

monitoring/alerts/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .disk_space_alert import DiskSpaceAlert
55
from .health_check_failed_alert import HealthCheckFailedAlert
66
from .instrument_file_pile_up_alert import InstrumentFilePileUpAlert
7+
from .instrument_stall_alert import InstrumentStallAlert
78
from .pump_pressure_alert import PumpPressureAlert
89
from .raw_file_error_alert import RawFileErrorAlert
910
from .s3_upload_failure_alert import S3UploadFailureAlert
@@ -16,6 +17,7 @@
1617
"DiskSpaceAlert",
1718
"HealthCheckFailedAlert",
1819
"InstrumentFilePileUpAlert",
20+
"InstrumentStallAlert",
1921
"PumpPressureAlert",
2022
"RawFileErrorAlert",
2123
"S3UploadFailureAlert",

monitoring/alerts/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
}
3838
INSTRUMENT_FILE_MIN_AGE_HOURS = 6 # Only consider files older than 6 hours
3939

40+
# Instrument stall alert configuration
41+
INSTRUMENT_STALL_THRESHOLD_HOURS = 2 # How long without a new file before alerting
42+
INSTRUMENT_STALL_INSTRUMENT_IDS: set[str] = set() # Instrument IDs to monitor; while this set is empty the stall check reports no issues
43+
4044
# Pump pressure alert configuration
4145
PUMP_PRESSURE_LOOKBACK_DAYS = 1
4246
PUMP_PRESSURE_WINDOW_SIZE = 5 # Number of samples to compare
@@ -75,3 +79,4 @@ class Cases:
7579
S3_UPLOAD_FAILURE = "s3_upload_failure"
7680
WEBAPP_HEALTH = "webapp_health"
7781
PUMP_PRESSURE_INCREASE = "pump_pressure_increase"
82+
INSTRUMENT_STALL = "instrument_stall"
monitoring/alerts/instrument_stall_alert.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""Instrument stall alert checker."""
2+
3+
import logging
4+
from datetime import datetime, timedelta
5+
6+
import pytz
7+
8+
from shared.db.models import KrakenStatus, RawFile
9+
10+
from .base_alert import BaseAlert
11+
from .config import (
12+
INSTRUMENT_STALL_INSTRUMENT_IDS,
13+
INSTRUMENT_STALL_THRESHOLD_HOURS,
14+
Cases,
15+
)
16+
17+
18+
class InstrumentStallAlert(BaseAlert):
    """Check if no new file has been acquired for a configured time period.

    For every instrument listed in ``INSTRUMENT_STALL_INSTRUMENT_IDS`` the most
    recently created ``RawFile`` is looked up. If that file is older than
    ``INSTRUMENT_STALL_THRESHOLD_HOURS`` an issue is reported once; the same
    file is not reported again until a newer file has been seen in between.
    """

    def __init__(self):
        """Initialize with empty alerted file names tracker."""
        super().__init__()
        # Maps instrument_id -> original_name of the file that was last
        # reported as stalled, so the same stall is not reported twice.
        self._alerted_file_names: dict[str, str] = {}

    @property
    def name(self) -> str:
        """Return the case name for this alert type."""
        return Cases.INSTRUMENT_STALL

    @staticmethod
    def _to_utc(timestamp: datetime) -> datetime:
        """Return *timestamp* as a timezone-aware datetime expressed in UTC.

        Naive values are interpreted as UTC; aware values are converted.
        """
        if timestamp.tzinfo is None:
            return pytz.utc.localize(timestamp)
        return timestamp.astimezone(pytz.UTC)

    def _get_issues(
        self, status_objects: list[KrakenStatus]
    ) -> list[tuple[str, tuple[str, datetime]]]:
        """Check for instruments with no recent file acquisition.

        :param status_objects: unused; this check queries ``RawFile`` directly.
        :return: one ``(instrument_id, (original_name, created_at))`` entry per
            newly stalled instrument, ordered by instrument id, with
            ``created_at`` as an aware UTC datetime.
        """
        del status_objects

        if not INSTRUMENT_STALL_INSTRUMENT_IDS:
            return []

        now = datetime.now(pytz.UTC)
        threshold = now - timedelta(hours=INSTRUMENT_STALL_THRESHOLD_HOURS)

        issues = []
        # Sorted so that the order of reported instruments (and thus the
        # formatted message) is stable between runs; set order is arbitrary.
        for instrument_id in sorted(INSTRUMENT_STALL_INSTRUMENT_IDS):
            latest = (
                RawFile.objects.filter(instrument_id=instrument_id)
                .order_by("-created_at")
                .only("original_name", "created_at")
                .first()
            )

            if not latest:
                logging.debug("No raw files found for instrument %s", instrument_id)
                continue

            created_at = self._to_utc(latest.created_at)

            if created_at >= threshold:
                # A recent file exists: forget any earlier alert so that a
                # future stall on this instrument is reported again.
                self._alerted_file_names.pop(instrument_id, None)
                continue

            if self._alerted_file_names.get(instrument_id) == latest.original_name:
                # Already reported a stall for this exact file.
                continue

            issues.append((instrument_id, (latest.original_name, created_at)))
            self._alerted_file_names[instrument_id] = latest.original_name

        return issues

    def format_message(self, issues: list[tuple[str, tuple[str, datetime]]]) -> str:
        """Format no new file acquired message.

        :param issues: entries as returned by ``_get_issues``.
        :return: a header line followed by one bullet line per instrument.
        """
        now = datetime.now(pytz.UTC)
        lines = []
        for instrument_id, (original_name, created_at) in issues:
            # Normalize so the "UTC" label below is always accurate and the
            # subtraction never mixes naive and aware datetimes.
            created_at = self._to_utc(created_at)
            hours_ago = (now - created_at).total_seconds() / 3600
            lines.append(
                f'- `{instrument_id}`: last file "{original_name}" '
                f"acquired at {created_at.strftime('%Y-%m-%d %H:%M:%S')} UTC "
                f"({hours_ago:.1f} hours ago)"
            )
        instruments_str = "\n".join(lines)
        return f"No new file acquired:\n{instruments_str}"

0 commit comments

Comments
 (0)