Commit d7e3045
Merge branch 'master' into fix/prevent-status-overwrite
2 parents: 163e343 + 6b211b8

2 files changed: 138 additions, 9 deletions

mod_ci/controllers.py

Lines changed: 48 additions & 9 deletions
@@ -584,22 +584,28 @@ def mark_test_failed(db, test, repository, message: str) -> bool:
         # Continue to try GitHub update even if DB fails
 
     # Step 2: Try to update GitHub status (CRITICAL - must not be skipped)
+    # Use retry logic since this is critical to prevent stuck "pending" status
     try:
-        gh_commit = repository.get_commit(test.commit)
-        # Include target_url so the status links to the test page
+        # Build target_url first (doesn't need retry)
         from flask import url_for
         try:
             target_url = url_for('test.by_id', test_id=test.id, _external=True)
         except RuntimeError:
             # Outside of request context
             target_url = f"https://sampleplatform.ccextractor.org/test/{test.id}"
-        update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {test.platform.value}", target_url)
+
+        # Use retry_with_backoff for GitHub API calls
+        def update_github_status():
+            gh_commit = repository.get_commit(test.commit)
+            update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {test.platform.value}", target_url)
+
+        retry_with_backoff(update_github_status, max_retries=3, initial_backoff=2.0)
         github_success = True
         log.info(f"Test {test.id}: GitHub status updated to ERROR: {message}")
     except GithubException as e:
-        log.error(f"Test {test.id}: GitHub API error while updating status: {e.status} - {e.data}")
+        log.error(f"Test {test.id}: GitHub API error while updating status (after retries): {e.status} - {e.data}")
     except Exception as e:
-        log.error(f"Test {test.id}: Failed to update GitHub status: {type(e).__name__}: {e}")
+        log.error(f"Test {test.id}: Failed to update GitHub status (after retries): {type(e).__name__}: {e}")
 
     # Log final status
     if db_success and github_success:
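The hunk above calls retry_with_backoff, a helper that lives elsewhere in mod_ci.controllers and is not shown in this diff. A minimal sketch of what such a helper plausibly looks like, assuming exponential doubling and the max_retries/initial_backoff keyword names used at the call sites (the real implementation may differ, e.g. in which exceptions it treats as retryable):

import time

def retry_with_backoff(func, max_retries=3, initial_backoff=2.0):
    """Call func(), retrying on failure with exponential backoff.

    Sketch under stated assumptions, not the actual helper.
    """
    backoff = initial_backoff
    for attempt in range(max_retries):
        try:
            return func()
        except Exception:
            if attempt == max_retries - 1:
                raise  # out of retries: let the caller's except blocks log it
            time.sleep(backoff)
            backoff *= 2  # wait 2s, then 4s, then 8s, ...

Note that repository.get_commit is inside the retried closure, so a transient failure fetching the commit object is retried along with the status update itself.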
@@ -663,8 +669,33 @@ def _diagnose_missing_artifact(repository, commit_sha: str, platform, log) -> tu
                        f"'{workflow_run.conclusion}'. Check the GitHub Actions logs for details.")
             return (message, False)  # Not retryable
         else:
-            # Build succeeded but artifact not found - may have expired
-            # This is a PERMANENT failure
+            # Build succeeded but artifact not found
+            # Check if the build completed very recently - if so, this might be
+            # GitHub API propagation delay (artifact exists but not visible yet)
+            # In that case, treat as retryable
+            ARTIFACT_PROPAGATION_GRACE_PERIOD = 300  # 5 minutes in seconds
+            try:
+                from datetime import datetime, timezone
+                now = datetime.now(timezone.utc)
+                # workflow_run.updated_at is when the run completed
+                if workflow_run.updated_at:
+                    completed_at = workflow_run.updated_at
+                    if completed_at.tzinfo is None:
+                        completed_at = completed_at.replace(tzinfo=timezone.utc)
+                    seconds_since_completion = (now - completed_at).total_seconds()
+                    if seconds_since_completion < ARTIFACT_PROPAGATION_GRACE_PERIOD:
+                        message = (f"Build completed recently ({int(seconds_since_completion)}s ago): "
+                                   f"'{expected_workflow}' succeeded but artifact not yet visible. "
+                                   f"Will retry (GitHub API propagation delay).")
+                        log.info(f"Artifact not found but build completed {int(seconds_since_completion)}s ago - "
+                                 f"treating as retryable (possible API propagation delay)")
+                        return (message, True)  # Retryable - API propagation delay
+            except Exception as e:
+                log.warning(f"Could not check workflow completion time: {e}")
+                # Fall through to permanent failure
+
+            # Build completed more than 5 minutes ago - artifact should be visible
+            # This is a PERMANENT failure (artifact expired or not uploaded)
             message = (f"Artifact not found: '{expected_workflow}' completed successfully, "
                        f"but no artifact was found. The artifact may have expired (GitHub deletes "
                        f"artifacts after a retention period) or was not uploaded properly.")
@@ -1959,8 +1990,16 @@ def progress_type_request(log, test, test_id, request) -> bool:
         else:
             message = progress.message
 
-        gh_commit = repository.get_commit(test.commit)
-        update_status_on_github(gh_commit, state, message, context, target_url=target_url)
+        # Use retry logic for final GitHub status update to prevent stuck "pending" states
+        # This is critical - if this fails, the PR will show "Tests queued" forever
+        try:
+            def update_final_status():
+                gh_commit = repository.get_commit(test.commit)
+                update_status_on_github(gh_commit, state, message, context, target_url=target_url)
+
+            retry_with_backoff(update_final_status, max_retries=3, initial_backoff=2.0)
+        except Exception as e:
+            log.error(f"Test {test_id}: Failed to update final GitHub status after retries: {e}")
 
         if status in [TestStatus.completed, TestStatus.canceled]:
             # Delete the current instance

tests/test_ci/test_controllers.py

Lines changed: 90 additions & 0 deletions
@@ -3094,6 +3094,35 @@ def test_mark_test_failed_includes_target_url(self, mock_progress, mock_update_g
         self.assertEqual(len(call_args[0]), 5)  # 5 positional args including target_url
         self.assertIn("456", call_args[0][4])  # target_url contains test ID
 
+    @mock.patch('mod_ci.controllers.update_status_on_github')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('mod_ci.controllers.retry_with_backoff')
+    def test_mark_test_failed_uses_retry_for_github(self, mock_retry, mock_progress, mock_update_github):
+        """Test that mark_test_failed uses retry_with_backoff for GitHub status update."""
+        from mod_ci.controllers import mark_test_failed
+
+        db = MagicMock()
+        test = MagicMock()
+        test.id = 789
+        test.commit = "abc123"
+        test.platform.value = "linux"
+
+        repository = MagicMock()
+        gh_commit = MagicMock()
+        repository.get_commit.return_value = gh_commit
+
+        # Make retry_with_backoff call the function it receives
+        mock_retry.side_effect = lambda func, **kwargs: func()
+
+        with mock.patch('run.log'):
+            mark_test_failed(db, test, repository, "Test error")
+
+        # Verify retry_with_backoff was called with correct parameters
+        mock_retry.assert_called_once()
+        call_kwargs = mock_retry.call_args[1]
+        self.assertEqual(call_kwargs['max_retries'], 3)
+        self.assertEqual(call_kwargs['initial_backoff'], 2.0)
+
 
 class TestParseGcpError(unittest.TestCase):
     """Tests for the parse_gcp_error helper function."""
@@ -3361,6 +3390,67 @@ def test_artifact_expired_is_not_retryable(self):
         self.assertIn("Artifact not found", message)
         self.assertIn("expired", message)
 
+    def test_recently_completed_build_is_retryable(self):
+        """Test that recently completed build (within grace period) returns retryable=True."""
+        from datetime import datetime, timedelta, timezone
+
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed successfully but very recently (1 minute ago)
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        workflow_run.updated_at = datetime.now(timezone.utc) - timedelta(seconds=60)
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build completed recently", message)
+        self.assertIn("Will retry", message)
+
+    def test_old_completed_build_is_not_retryable(self):
+        """Test that build completed more than grace period ago returns retryable=False."""
+        from datetime import datetime, timedelta, timezone
+
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed 10 minutes ago (beyond grace period)
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        workflow_run.updated_at = datetime.now(timezone.utc) - timedelta(seconds=600)
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Artifact not found", message)
+
     def test_no_workflow_run_is_retryable(self):
         """Test that missing workflow run returns retryable=True."""
         from mod_ci.controllers import _diagnose_missing_artifact
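The two new tests probe 60 seconds (inside the grace period) and 600 seconds (outside it), but not the 300-second boundary itself. Because the diff compares with a strict less-than, a run that completed exactly at the grace period already takes the permanent-failure path. A quick check of that edge case (a sketch, not part of this commit):

from datetime import datetime, timedelta, timezone

ARTIFACT_PROPAGATION_GRACE_PERIOD = 300

now = datetime.now(timezone.utc)
completed_at = now - timedelta(seconds=300)  # exactly at the boundary
age = (now - completed_at).total_seconds()   # 300.0

# Strict '<' means the boundary itself is NOT retryable
assert not (age < ARTIFACT_PROPAGATION_GRACE_PERIOD)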
