Skip to content

Commit 1dd3476

Browse files
authored
feat(crons): Use timeout_at for timed out check-ins (#52570)
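Query in-progress check-ins by their `timeout_at` timestamp to find the ones that have timed out, instead of computing each timeout from `date_updated` and the monitor's `max_runtime`.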
1 parent e21c9ad commit 1dd3476

File tree

2 files changed

+21
-13
lines changed

2 files changed

+21
-13
lines changed

src/sentry/monitors/tasks.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
from datetime import timedelta
32

43
from django.utils import timezone
54

@@ -102,21 +101,13 @@ def check_monitors(current_datetime=None):
102101
except Exception:
103102
logger.exception("Exception in check_monitors - mark missed")
104103

105-
qs = MonitorCheckIn.objects.filter(status=CheckInStatus.IN_PROGRESS).select_related(
106-
"monitor", "monitor_environment"
107-
)[:CHECKINS_LIMIT]
104+
qs = MonitorCheckIn.objects.filter(
105+
status=CheckInStatus.IN_PROGRESS, timeout_at__lte=current_datetime
106+
).select_related("monitor", "monitor_environment")[:CHECKINS_LIMIT]
108107
metrics.gauge("sentry.monitors.tasks.check_monitors.timeout_count", qs.count())
109108
# check for any monitors which are still running and have exceeded their maximum runtime
110109
for checkin in qs:
111110
try:
112-
timeout = timedelta(
113-
minutes=(checkin.monitor.config or {}).get("max_runtime") or TIMEOUT
114-
)
115-
# Check against date_updated to allow monitors to run for longer as
116-
# long as they continue to send heart beats updating the checkin
117-
if checkin.date_updated > current_datetime - timeout:
118-
continue
119-
120111
monitor_environment = checkin.monitor_environment
121112
logger.info(
122113
"monitor_environment.checkin-timeout",
@@ -136,7 +127,16 @@ def check_monitors(current_datetime=None):
136127
if not has_newer_result:
137128
monitor_environment.mark_failed(
138129
reason=MonitorFailure.DURATION,
139-
occurrence_context={"duration": (timeout.seconds // 60) % 60},
130+
occurrence_context={
131+
"duration": (checkin.monitor.config or {}).get("max_runtime") or TIMEOUT
132+
},
140133
)
141134
except Exception:
142135
logger.exception("Exception in check_monitors - mark timeout")
136+
137+
# safety check for check-ins stuck in the backlog
138+
backlog_count = MonitorCheckIn.objects.filter(
139+
status=CheckInStatus.IN_PROGRESS, timeout_at__isnull=True
140+
).count()
141+
if backlog_count:
142+
logger.exception(f"Exception in check_monitors - backlog count {backlog_count} is > 0")

tests/sentry/monitors/test_tasks.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ def test_timeout_with_no_future_complete_checkin(self):
206206
status=CheckInStatus.IN_PROGRESS,
207207
date_added=check_in_24hr_ago,
208208
date_updated=check_in_24hr_ago,
209+
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
209210
)
210211
# We started another checkin right now
211212
checkin2 = MonitorCheckIn.objects.create(
@@ -215,6 +216,7 @@ def test_timeout_with_no_future_complete_checkin(self):
215216
status=CheckInStatus.IN_PROGRESS,
216217
date_added=next_checkin_ts,
217218
date_updated=next_checkin_ts,
219+
timeout_at=next_checkin_ts + timedelta(minutes=30),
218220
)
219221

220222
assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago
@@ -268,6 +270,7 @@ def test_timeout_with_future_complete_checkin(self):
268270
status=CheckInStatus.IN_PROGRESS,
269271
date_added=check_in_24hr_ago,
270272
date_updated=check_in_24hr_ago,
273+
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
271274
)
272275
checkin2 = MonitorCheckIn.objects.create(
273276
monitor=monitor,
@@ -276,6 +279,7 @@ def test_timeout_with_future_complete_checkin(self):
276279
status=CheckInStatus.OK,
277280
date_added=next_checkin_ts,
278281
date_updated=next_checkin_ts,
282+
timeout_at=next_checkin_ts + timedelta(minutes=30),
279283
)
280284

281285
assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago
@@ -321,6 +325,7 @@ def test_timeout_via_max_runtime_configuration(self):
321325
status=CheckInStatus.IN_PROGRESS,
322326
date_added=next_checkin_ts,
323327
date_updated=next_checkin_ts,
328+
timeout_at=next_checkin_ts + timedelta(minutes=60),
324329
)
325330

326331
assert checkin.date_added == checkin.date_updated == next_checkin_ts
@@ -424,6 +429,7 @@ def test_timeout_exception_handling(self, logger):
424429
status=CheckInStatus.IN_PROGRESS,
425430
date_added=check_in_24hr_ago,
426431
date_updated=check_in_24hr_ago,
432+
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
427433
)
428434

429435
# This monitor will be fine
@@ -448,6 +454,7 @@ def test_timeout_exception_handling(self, logger):
448454
status=CheckInStatus.IN_PROGRESS,
449455
date_added=check_in_24hr_ago,
450456
date_updated=check_in_24hr_ago,
457+
timeout_at=check_in_24hr_ago + timedelta(minutes=30),
451458
)
452459
checkin2 = MonitorCheckIn.objects.create(
453460
monitor=monitor,
@@ -456,6 +463,7 @@ def test_timeout_exception_handling(self, logger):
456463
status=CheckInStatus.IN_PROGRESS,
457464
date_added=next_checkin_ts,
458465
date_updated=next_checkin_ts,
466+
timeout_at=next_checkin_ts + timedelta(minutes=30),
459467
)
460468

461469
assert checkin1.date_added == checkin1.date_updated == check_in_24hr_ago

0 commit comments

Comments
 (0)