Skip to content

Commit 1981d6f

Browse files
kddubey authored and andrewshie-sentry committed
fix(seer): Temp log GPU fixability failures instead of raising (#97657)
While I debug the GPU `group-seer` deployment in s4s, let's avoid creating Sentry issues and blocking deploys.
1 parent 1d86adf commit 1981d6f

File tree

2 files changed

+57
-9
lines changed

2 files changed

+57
-9
lines changed

src/sentry/seer/autofix/issue_summary.py

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -185,27 +185,42 @@ def _call_seer(
185185
)
186186

187187

188-
def _generate_fixability_score(group: Group):
188+
def _generate_fixability_score(group: Group) -> SummarizeIssueResponse | None:
189189
payload = {
190190
"group_id": group.id,
191191
"organization_slug": group.organization.slug,
192192
"organization_id": group.organization.id,
193193
"project_id": group.project.id,
194194
}
195195

196-
if in_random_rollout("issues.fixability.gpu-rollout-rate"):
196+
use_gpu = in_random_rollout("issues.fixability.gpu-rollout-rate")
197+
if use_gpu:
197198
connection_pool = fixability_connection_pool_gpu
198199
else:
199200
connection_pool = fixability_connection_pool
200201

201-
response = make_signed_seer_api_request(
202-
connection_pool,
203-
"/v1/automation/summarize/fixability",
204-
body=orjson.dumps(payload, option=orjson.OPT_NON_STR_KEYS),
205-
timeout=settings.SEER_FIXABILITY_TIMEOUT,
206-
)
202+
# TODO(kddubey): rm this handling once we verify that the GPU deployment works
203+
try:
204+
response = make_signed_seer_api_request(
205+
connection_pool,
206+
"/v1/automation/summarize/fixability",
207+
body=orjson.dumps(payload, option=orjson.OPT_NON_STR_KEYS),
208+
timeout=settings.SEER_FIXABILITY_TIMEOUT,
209+
)
210+
except Exception:
211+
if not use_gpu:
212+
raise
213+
else:
214+
logger.warning("GPU fixability connection failed", exc_info=True)
215+
return None
216+
207217
if response.status >= 400:
208-
raise Exception(f"Seer API error: {response.status}")
218+
if not use_gpu:
219+
raise Exception(f"Seer API error: {response.status}")
220+
else:
221+
logger.warning("GPU fixability endpoint failed", extra={"status": response.status})
222+
return None
223+
209224
response_data = orjson.loads(response.data)
210225
return SummarizeIssueResponse.validate(response_data)
211226

@@ -302,6 +317,9 @@ def _run_automation(
302317
with sentry_sdk.start_span(op="ai_summary.generate_fixability_score"):
303318
issue_summary = _generate_fixability_score(group)
304319

320+
if not issue_summary:
321+
return
322+
305323
if not issue_summary.scores:
306324
raise ValueError("Issue summary scores is None or empty.")
307325
if issue_summary.scores.fixability_score is None:

tests/sentry/seer/autofix/test_issue_summary.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -572,6 +572,36 @@ def test_run_automation_saves_fixability_score(
572572
self.group.refresh_from_db()
573573
assert self.group.seer_fixability_score == 0.5
574574

575+
@patch("sentry.seer.autofix.issue_summary._trigger_autofix_task.delay")
576+
@patch("sentry.seer.autofix.issue_summary.get_autofix_state")
577+
@patch("sentry.seer.autofix.issue_summary._generate_fixability_score")
578+
def test_run_automation_handles_none_fixability_score(
579+
self,
580+
mock_generate_fixability_score,
581+
mock_get_autofix_state,
582+
mock_trigger_autofix_task,
583+
):
584+
"""Test that _run_automation returns early when _generate_fixability_score returns None (GPU failure case)."""
585+
self.group.project.update_option("sentry:autofix_automation_tuning", "high")
586+
mock_event = Mock(event_id="test_event_id")
587+
mock_user = self.user
588+
589+
mock_generate_fixability_score.return_value = None
590+
mock_get_autofix_state.return_value = None
591+
592+
self.group.refresh_from_db()
593+
initial_fixability_score = self.group.seer_fixability_score
594+
595+
_run_automation(self.group, mock_user, mock_event, source=SeerAutomationSource.POST_PROCESS)
596+
597+
mock_generate_fixability_score.assert_called_once_with(self.group)
598+
mock_trigger_autofix_task.assert_not_called()
599+
600+
self.group.refresh_from_db()
601+
assert self.group.seer_fixability_score == initial_fixability_score
602+
603+
mock_get_autofix_state.assert_not_called()
604+
575605
@patch("sentry.seer.autofix.issue_summary._trigger_autofix_task.delay")
576606
@patch("sentry.seer.autofix.issue_summary.get_autofix_state")
577607
@patch("sentry.seer.autofix.issue_summary._generate_fixability_score")

0 commit comments

Comments
 (0)