diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py
index cc0584e2..2b0b4b66 100755
--- a/mod_ci/controllers.py
+++ b/mod_ci/controllers.py
@@ -584,22 +584,28 @@ def mark_test_failed(db, test, repository, message: str) -> bool:
         # Continue to try GitHub update even if DB fails
 
     # Step 2: Try to update GitHub status (CRITICAL - must not be skipped)
+    # Use retry logic since this is critical to prevent a stuck "pending" status
     try:
-        gh_commit = repository.get_commit(test.commit)
-        # Include target_url so the status links to the test page
+        # Build target_url first (doesn't need retry)
         from flask import url_for
         try:
             target_url = url_for('test.by_id', test_id=test.id, _external=True)
         except RuntimeError:
             # Outside of request context
             target_url = f"https://sampleplatform.ccextractor.org/test/{test.id}"
-        update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {test.platform.value}", target_url)
+
+        # Use retry_with_backoff for GitHub API calls
+        def update_github_status():
+            gh_commit = repository.get_commit(test.commit)
+            update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {test.platform.value}", target_url)
+
+        retry_with_backoff(update_github_status, max_retries=3, initial_backoff=2.0)
         github_success = True
         log.info(f"Test {test.id}: GitHub status updated to ERROR: {message}")
     except GithubException as e:
-        log.error(f"Test {test.id}: GitHub API error while updating status: {e.status} - {e.data}")
+        log.error(f"Test {test.id}: GitHub API error while updating status (after retries): {e.status} - {e.data}")
     except Exception as e:
-        log.error(f"Test {test.id}: Failed to update GitHub status: {type(e).__name__}: {e}")
+        log.error(f"Test {test.id}: Failed to update GitHub status (after retries): {type(e).__name__}: {e}")
 
     # Log final status
     if db_success and github_success:
@@ -663,8 +669,33 @@ def _diagnose_missing_artifact(repository, commit_sha: str, platform, log) -> tu
                        f"'{workflow_run.conclusion}'. Check the GitHub Actions logs for details.")
             return (message, False)  # Not retryable
         else:
-            # Build succeeded but artifact not found - may have expired
-            # This is a PERMANENT failure
+            # Build succeeded but artifact not found
+            # Check if the build completed very recently - if so, this might be
+            # GitHub API propagation delay (artifact exists but not visible yet)
+            # In that case, treat as retryable
+            ARTIFACT_PROPAGATION_GRACE_PERIOD = 300  # 5 minutes in seconds
+            try:
+                from datetime import datetime, timezone
+                now = datetime.now(timezone.utc)
+                # workflow_run.updated_at is when the run completed
+                if workflow_run.updated_at:
+                    completed_at = workflow_run.updated_at
+                    if completed_at.tzinfo is None:
+                        completed_at = completed_at.replace(tzinfo=timezone.utc)
+                    seconds_since_completion = (now - completed_at).total_seconds()
+                    if seconds_since_completion < ARTIFACT_PROPAGATION_GRACE_PERIOD:
+                        message = (f"Build completed recently ({int(seconds_since_completion)}s ago): "
+                                   f"'{expected_workflow}' succeeded but artifact not yet visible. "
+                                   f"Will retry (GitHub API propagation delay).")
+                        log.info(f"Artifact not found but build completed {int(seconds_since_completion)}s ago - "
+                                 f"treating as retryable (possible API propagation delay)")
+                        return (message, True)  # Retryable - API propagation delay
+            except Exception as e:
+                log.warning(f"Could not check workflow completion time: {e}")
+                # Fall through to permanent failure
+
+            # Build completed more than 5 minutes ago - artifact should be visible
+            # This is a PERMANENT failure (artifact expired or not uploaded)
             message = (f"Artifact not found: '{expected_workflow}' completed successfully, "
                        f"but no artifact was found. The artifact may have expired (GitHub deletes "
                        f"artifacts after a retention period) or was not uploaded properly.")
diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py
index 0470e219..f5cdafe1 100644
--- a/tests/test_ci/test_controllers.py
+++ b/tests/test_ci/test_controllers.py
@@ -3094,6 +3094,35 @@ def test_mark_test_failed_includes_target_url(self, mock_progress, mock_update_g
         self.assertEqual(len(call_args[0]), 5)  # 5 positional args including target_url
         self.assertIn("456", call_args[0][4])  # target_url contains test ID
 
+    @mock.patch('mod_ci.controllers.update_status_on_github')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('mod_ci.controllers.retry_with_backoff')
+    def test_mark_test_failed_uses_retry_for_github(self, mock_retry, mock_progress, mock_update_github):
+        """Test that mark_test_failed uses retry_with_backoff for the GitHub status update."""
+        from mod_ci.controllers import mark_test_failed
+
+        db = MagicMock()
+        test = MagicMock()
+        test.id = 789
+        test.commit = "abc123"
+        test.platform.value = "linux"
+
+        repository = MagicMock()
+        gh_commit = MagicMock()
+        repository.get_commit.return_value = gh_commit
+
+        # Make retry_with_backoff call the function it receives
+        mock_retry.side_effect = lambda func, **kwargs: func()
+
+        with mock.patch('run.log'):
+            mark_test_failed(db, test, repository, "Test error")
+
+        # Verify retry_with_backoff was called with the correct parameters
+        mock_retry.assert_called_once()
+        call_kwargs = mock_retry.call_args[1]
+        self.assertEqual(call_kwargs['max_retries'], 3)
+        self.assertEqual(call_kwargs['initial_backoff'], 2.0)
+
 
 class TestParseGcpError(unittest.TestCase):
     """Tests for the parse_gcp_error helper function."""
@@ -3361,6 +3390,67 @@ def test_artifact_expired_is_not_retryable(self):
         self.assertIn("Artifact not found", message)
         self.assertIn("expired", message)
 
+    def test_recently_completed_build_is_retryable(self):
+        """Test that a recently completed build (within the grace period) returns retryable=True."""
+        from datetime import datetime, timedelta, timezone
+
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed successfully but very recently (1 minute ago)
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        workflow_run.updated_at = datetime.now(timezone.utc) - timedelta(seconds=60)
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertTrue(is_retryable)
+        self.assertIn("Build completed recently", message)
+        self.assertIn("Will retry", message)
+
+    def test_old_completed_build_is_not_retryable(self):
+        """Test that a build completed longer ago than the grace period returns retryable=False."""
+        from datetime import datetime, timedelta, timezone
+
+        from mod_ci.controllers import _diagnose_missing_artifact
+
+        repository = MagicMock()
+        log = MagicMock()
+
+        # Mock workflow
+        workflow = MagicMock()
+        workflow.id = 1
+        workflow.name = "Build CCExtractor on Linux"
+        repository.get_workflows.return_value = [workflow]
+
+        # Mock workflow run - completed 10 minutes ago (beyond grace period)
+        workflow_run = MagicMock()
+        workflow_run.workflow_id = 1
+        workflow_run.status = "completed"
+        workflow_run.conclusion = "success"
+        workflow_run.updated_at = datetime.now(timezone.utc) - timedelta(seconds=600)
+        repository.get_workflow_runs.return_value = [workflow_run]
+
+        message, is_retryable = _diagnose_missing_artifact(
+            repository, "abc123", TestPlatform.linux, log
+        )
+
+        self.assertFalse(is_retryable)
+        self.assertIn("Artifact not found", message)
+
     def test_no_workflow_run_is_retryable(self):
         """Test that missing workflow run returns retryable=True."""
         from mod_ci.controllers import _diagnose_missing_artifact
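
Note on retry_with_backoff: the patch calls this helper (and the new test patches
'mod_ci.controllers.retry_with_backoff'), but its implementation is not part of this
diff. A minimal sketch of what such a helper could look like, assuming plain
exponential backoff that doubles the wait each attempt and re-raises the final
exception so callers can still catch GithubException (hypothetical, for reference
only; the real helper may differ):

    import time

    def retry_with_backoff(func, max_retries=3, initial_backoff=2.0):
        """Call func(), retrying on any exception with exponential backoff.

        Re-raises the last exception once max_retries attempts are exhausted,
        so callers such as mark_test_failed can still catch GithubException.
        """
        backoff = initial_backoff
        for attempt in range(max_retries):
            try:
                return func()
            except Exception:
                if attempt == max_retries - 1:
                    raise  # Out of attempts - surface the error to the caller
                time.sleep(backoff)  # Wait before the next attempt
                backoff *= 2  # Hypothetical doubling; actual policy may vary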