Skip to content

Commit 54e6087

Browse files
authored
[ISD-3751] Fix: rejecting without retry for reactive job not found. (#581)
1 parent 34472c9 commit 54e6087

File tree

4 files changed

+64
-15
lines changed

4 files changed

+64
-15
lines changed

docs/changelog.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
This changelog documents user-relevant changes to the GitHub runner charm.
44

5+
## 2025-06-24
56

6-
### 2025-06-17
7+
- Fix a bug where deleted GitHub Actions Job would cause an endless loop of retries.
78

8-
- Fix bug where SSH connection error always appears in the logs.
9+
### 2025-06-17
910

11+
- Fix a bug where SSH connection error always appears in the logs.
1012

1113
### 2025-06-16
1214

github-runner-manager/src/github_runner_manager/github_client.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -342,14 +342,22 @@ def get_job_info(self, path: GitHubRepo, job_id: str) -> JobInfo:
342342
path: GitHub repository path in the format '<owner>/<repo>'.
343343
job_id: The job id.
344344
345+
Raises:
346+
JobNotFoundError: Cannot find job on GitHub.
347+
345348
Returns:
346349
The JSON response from the API.
347350
"""
348-
job_raw = self._client.actions.get_job_for_workflow_run(
349-
owner=path.owner,
350-
repo=path.repo,
351-
job_id=job_id,
352-
)
351+
try:
352+
job_raw = self._client.actions.get_job_for_workflow_run(
353+
owner=path.owner,
354+
repo=path.repo,
355+
job_id=job_id,
356+
)
357+
except HTTPError as exc:
358+
if exc.code == 404:
359+
raise JobNotFoundError(f"Could not find job for job id {job_id}.") from exc
360+
raise
353361
return self._to_job_info(job_raw)
354362

355363
@staticmethod

github-runner-manager/src/github_runner_manager/reactive/consumer.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from github_runner_manager.manager.models import RunnerMetadata
2222
from github_runner_manager.manager.runner_manager import RunnerManager
23-
from github_runner_manager.platform.platform_provider import PlatformProvider
23+
from github_runner_manager.platform.platform_provider import JobNotFoundError, PlatformProvider
2424
from github_runner_manager.reactive.types_ import QueueConfig
2525

2626
logger = logging.getLogger(__name__)
@@ -90,7 +90,8 @@ def get_queue_size(queue_config: QueueConfig) -> int:
9090
raise QueueError("Error when communicating with the queue") from exc
9191

9292

93-
def consume(
93+
# Ignore `consume` too complex as it is pending re-design.
94+
def consume( # noqa: C901
9495
queue_config: QueueConfig,
9596
runner_manager: RunnerManager,
9697
platform_provider: PlatformProvider,
@@ -145,12 +146,18 @@ def consume(
145146
except ValueError:
146147
msg.reject(requeue=False)
147148
break
148-
if platform_provider.check_job_been_picked_up(
149-
metadata=metadata, job_url=job_details.url
150-
):
151-
logger.info("reactive job: %s already picked up.", job_details)
152-
msg.ack()
153-
continue
149+
try:
150+
if platform_provider.check_job_been_picked_up(
151+
metadata=metadata, job_url=job_details.url
152+
):
153+
logger.info("reactive job: %s already picked up.", job_details)
154+
msg.ack()
155+
continue
156+
except JobNotFoundError:
157+
logger.warning(
158+
"Unable to find the job %s. Not retrying this job.", job_details.url
159+
)
160+
msg.reject(requeue=False)
154161
_spawn_runner(
155162
runner_manager=runner_manager,
156163
job_url=job_details.url,

tests/integration/test_reactive.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pytest_asyncio
1111
from github import Branch, Repository
1212
from github_runner_manager.manager.cloud_runner_manager import PostJobStatus
13+
from github_runner_manager.reactive.consumer import JobDetails
1314
from juju.application import Application
1415
from pytest_operator.plugin import OpsTest
1516

@@ -136,6 +137,37 @@ async def _runner_installed_in_metrics_log() -> bool:
136137
await _assert_metrics_are_logged(app, github_repository)
137138

138139

140+
@pytest.mark.abort_on_fail
141+
async def test_reactive_mode_with_not_found_job(
142+
ops_test: OpsTest,
143+
app: Application,
144+
):
145+
"""
146+
arrange: Place a message in the queue with a non-existent job url.
147+
act: Call reconcile.
148+
assert: The message is removed from the queue.
149+
"""
150+
mongodb_uri = await get_mongodb_uri(ops_test, app)
151+
152+
labels = {app.name, "x64"} # The architecture label should be ignored in the
153+
# label validation in the reactive consumer.
154+
job = JobDetails(
155+
labels=labels,
156+
url="https://github.com/canonical/github-runner-operator/actions/runs/mock-run/job/mock-job",
157+
)
158+
add_to_queue(
159+
json.dumps(json.loads(job.json()) | {"ignored_noise": "foobar"}),
160+
mongodb_uri,
161+
app.name,
162+
)
163+
164+
# There may be a race condition between getting the token and spawning the machine.
165+
await sleep(10)
166+
await wait_for_reconcile(app)
167+
168+
assert_queue_is_empty(mongodb_uri, app.name)
169+
170+
139171
@pytest.mark.abort_on_fail
140172
async def test_reactive_mode_does_not_consume_jobs_with_unsupported_labels(
141173
ops_test: OpsTest,

0 commit comments

Comments
 (0)