Skip to content

Commit c3b32a7

Browse files
ref(crons): Reorganize incident creation / issue occurrence logic (#80528)
Since we'll be doing more with issue occurrences, split out the concept of incidents into its own logic module, as well as incident_occurrence into its own module. Part of GH-80527
1 parent 5fc9f67 commit c3b32a7

File tree

4 files changed

+289
-264
lines changed

4 files changed

+289
-264
lines changed
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
import uuid
5+
from collections import Counter
6+
from collections.abc import Mapping, Sequence
7+
from datetime import datetime, timezone
8+
from typing import TYPE_CHECKING
9+
10+
from django.utils.text import get_text_list
11+
from django.utils.translation import gettext_lazy as _
12+
13+
from sentry.issues.grouptype import MonitorIncidentType
14+
from sentry.monitors.models import (
15+
CheckInStatus,
16+
MonitorCheckIn,
17+
MonitorEnvironment,
18+
MonitorIncident,
19+
)
20+
from sentry.monitors.types import SimpleCheckIn
21+
22+
if TYPE_CHECKING:
23+
from django.utils.functional import _StrPromise
24+
25+
logger = logging.getLogger(__name__)
26+
27+
28+
def create_incident_occurrence(
    failed_checkins: Sequence[SimpleCheckIn],
    failed_checkin: MonitorCheckIn,
    incident: MonitorIncident,
    received: datetime | None,
) -> None:
    """
    Build an issue occurrence for a failing check-in and produce it to Kafka.

    ``failed_checkins`` is only used to render the "Failure reason" evidence
    text. ``failed_checkin`` supplies the monitor environment and the optional
    trace id. ``incident`` supplies the fingerprint and the
    ``monitor.incident`` tag. ``received`` becomes the event's ``received``
    time; when it is falsy the current UTC time is used instead.

    Nothing is produced when the check-in has no monitor environment.
    """
    from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
    from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka

    monitor_env = failed_checkin.monitor_environment
    if monitor_env is None:
        return

    now = datetime.now(timezone.utc)

    # Rendered in the evidence display; "Never" when no successful check-in
    # is returned for this environment.
    last_ok_checkin = monitor_env.get_last_successful_checkin()
    last_ok_display = last_ok_checkin.date_added.isoformat() if last_ok_checkin else "Never"

    monitor = monitor_env.monitor

    occurrence = IssueOccurrence(
        id=uuid.uuid4().hex,
        resource_id=None,
        project_id=monitor.project_id,
        event_id=uuid.uuid4().hex,
        fingerprint=[incident.grouphash],
        type=MonitorIncidentType,
        issue_title=f"Monitor failure: {monitor.name}",
        subtitle="Your monitor has reached its failure threshold.",
        evidence_display=[
            IssueEvidence(
                name="Failure reason",
                value=str(get_failure_reason(failed_checkins)),
                important=True,
            ),
            IssueEvidence(
                name="Environment",
                value=monitor_env.get_environment().name,
                important=False,
            ),
            IssueEvidence(
                name="Last successful check-in",
                value=last_ok_display,
                important=False,
            ),
        ],
        evidence_data={},
        culprit="",
        detection_time=now,
        level="error",
        assignee=monitor.owner_actor,
    )

    checkin_trace = failed_checkin.trace_id
    trace_id = checkin_trace.hex if checkin_trace else None

    contexts = {"monitor": get_monitor_environment_context(monitor_env)}
    event_data = {
        "contexts": contexts,
        "environment": monitor_env.get_environment().name,
        "event_id": occurrence.event_id,
        "fingerprint": [incident.grouphash],
        "platform": "other",
        "project_id": monitor.project_id,
        # Prefer the time the triggering check-in was written to relay, when
        # the caller provided it.
        "received": (received or now).isoformat(),
        "sdk": None,
        "tags": {
            "monitor.id": str(monitor.guid),
            "monitor.slug": str(monitor.slug),
            "monitor.incident": str(incident.id),
        },
        "timestamp": now.isoformat(),
    }

    # The trace context is only attached when the check-in carries a trace id.
    if trace_id:
        contexts["trace"] = {"trace_id": trace_id, "span_id": None}

    produce_occurrence_to_kafka(
        payload_type=PayloadType.OCCURRENCE,
        occurrence=occurrence,
        event_data=event_data,
    )
114+
115+
116+
# Translatable lower-case label for each failing check-in status. Used both to
# decide which statuses are counted and to compose multi check-in summaries
# of the form "<count> <label>".
HUMAN_FAILURE_STATUS_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("error"),
    CheckInStatus.MISSED: _("missed"),
    CheckInStatus.TIMEOUT: _("timeout"),
}
121+
122+
# Full sentences for the single check-in case. These are spelled out per
# status because the article differs ("A" vs "An") depending on the status.
SINGULAR_HUMAN_FAILURE_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("An error check-in was detected"),
    CheckInStatus.MISSED: _("A missed check-in was detected"),
    CheckInStatus.TIMEOUT: _("A timeout check-in was detected"),
}
128+
129+
130+
def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]) -> _StrPromise | str:
    """
    Build a human-readable string from a list of failed check-ins.

    Only check-ins whose status is a key of ``HUMAN_FAILURE_STATUS_MAP`` are
    counted. Statuses are listed in the order they are first encountered.
    Examples of the (untranslated) text produced:

        "An error check-in was detected"
        "3 missed check-ins detected"
        "2 missed, 1 timeout and 1 error check-ins detected"
        "A failed check-in was detected"

    The last form is the fallback used when none of the given check-ins has a
    recognised failure status (including when the sequence is empty).
    """
    status_counts = Counter(
        checkin["status"]
        for checkin in failed_checkins
        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP
    )

    # Without this guard the summary below would be built from an empty list
    # and come out as the malformed " check-ins detected".
    if not status_counts:
        return _("A failed check-in was detected")

    # Exactly one counted check-in means exactly one key in the counter.
    if sum(status_counts.values()) == 1:
        only_status = next(iter(status_counts))
        return SINGULAR_HUMAN_FAILURE_MAP[only_status]

    human_status = get_text_list(
        [
            "%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
            for status, count in status_counts.items()
        ],
        last_word=_("and"),
    )

    return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
157+
158+
159+
def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
    """
    Build the ``monitor`` event context for a monitor environment.

    Returns a dict with the monitor's ``id`` (guid), ``slug``, ``name``,
    ``config``, the environment's display ``status`` and the monitor's display
    ``type``. The returned ``config`` is a shallow copy of the monitor's
    config; when it contains ``schedule_type`` that value is replaced by
    ``get_schedule_type_display()``. The monitor's own config is not mutated.
    """
    monitor = monitor_environment.monitor

    # Work on a copy so the display value never leaks back into the model.
    config = monitor.config.copy()
    if "schedule_type" in config:
        config["schedule_type"] = monitor.get_schedule_type_display()

    return {
        "id": str(monitor.guid),
        "slug": str(monitor.slug),
        "name": monitor.name,
        # Return the adjusted copy. Previously the raw ``monitor.config`` was
        # returned here, which discarded the ``schedule_type`` display value.
        "config": config,
        "status": monitor_environment.get_status_display(),
        "type": monitor.get_type_display(),
    }
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
from datetime import datetime
5+
from typing import cast
6+
7+
from sentry.monitors.logic.incident_occurrence import create_incident_occurrence
8+
from sentry.monitors.models import CheckInStatus, MonitorCheckIn, MonitorIncident, MonitorStatus
9+
from sentry.monitors.types import SimpleCheckIn
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
def try_incident_threshold(
    failed_checkin: MonitorCheckIn,
    failure_issue_threshold: int,
    received: datetime | None,
) -> bool:
    """
    Decide whether a failed check-in opens (or continues) an incident for its
    monitor environment, and create issue occurrences when it does.

    Returns False when the check-in has no monitor environment, when an OK
    check-in is found among the most recent ``failure_issue_threshold``
    check-ins, or when the environment's status is anything other than OK,
    ACTIVE or ERROR. Otherwise the ``monitor_environment_failed`` signal is
    sent and True is returned.

    ``received`` is passed through unchanged to ``create_incident_occurrence``.
    """
    from sentry.signals import monitor_environment_failed

    monitor_env = failed_checkin.monitor_environment
    if monitor_env is None:
        return False

    def only_failed_checkin() -> list[SimpleCheckIn]:
        # The triggering check-in, in the same dict shape as the `.values()`
        # rows used for the multi check-in case.
        return [
            {
                "id": failed_checkin.id,
                "date_added": failed_checkin.date_added,
                "status": failed_checkin.status,
            }
        ]

    incident: MonitorIncident | None

    if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
        # The environment is currently healthy: check whether this failure
        # reaches the threshold.
        if failure_issue_threshold == 1:
            previous_checkins = only_failed_checkin()
        else:
            # `.values()` is used for performance reasons.
            newest_first = cast(
                list[SimpleCheckIn],
                MonitorCheckIn.objects.filter(
                    monitor_environment=monitor_env,
                    date_added__lte=failed_checkin.date_added,
                )
                .order_by("-date_added")
                .values("id", "date_added", "status"),
            )

            # Slice first, then reverse, so the list starts with the oldest
            # check-in inside the threshold window.
            previous_checkins = list(reversed(newest_first[:failure_issue_threshold]))

            # Any successful check-in inside the window means the threshold
            # has NOT been reached.
            if any(c["status"] == CheckInStatus.OK for c in previous_checkins):
                return False

        # Flip the environment into the ERROR state.
        monitor_env.status = MonitorStatus.ERROR
        monitor_env.save(update_fields=("status",))

        oldest_checkin = previous_checkins[0]

        incident, _created = MonitorIncident.objects.get_or_create(
            monitor_environment=monitor_env,
            resolving_checkin=None,
            defaults={
                "monitor": monitor_env.monitor,
                "starting_checkin_id": oldest_checkin["id"],
                "starting_timestamp": oldest_checkin["date_added"],
            },
        )

    elif monitor_env.status == MonitorStatus.ERROR:
        # Already failing: report just this check-in against the incident
        # that is currently active for the environment.
        previous_checkins = only_failed_checkin()
        incident = monitor_env.active_incident
    else:
        # Occurrences are not sent for any other status.
        return False

    # Occurrences are only created when there is an incident and neither the
    # monitor nor the environment is muted.
    muted = monitor_env.monitor.is_muted or monitor_env.is_muted
    if not muted and incident:
        checkin_ids = [c["id"] for c in previous_checkins]
        for checkin in MonitorCheckIn.objects.filter(id__in=checkin_ids):
            create_incident_occurrence(
                previous_checkins,
                checkin,
                incident,
                received=received,
            )

    monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

    return True

0 commit comments

Comments
 (0)