From 26c44ec1451b81e865e3d41df3ff5006dde02451 Mon Sep 17 00:00:00 2001 From: Annmool Date: Mon, 3 Nov 2025 11:56:06 +0530 Subject: [PATCH 1/3] report: include failure_reason for manual kills and marking run dead; add test --- teuthology/kill.py | 6 ++-- teuthology/report.py | 23 ++++++++++---- teuthology/task/internal/__init__.py | 4 ++- teuthology/test/test_report_dead_reason.py | 36 ++++++++++++++++++++++ 4 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 teuthology/test/test_report_dead_reason.py diff --git a/teuthology/kill.py b/teuthology/kill.py index 137e49080e..152560b314 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -76,7 +76,8 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, targets = find_targets(run_name) names = list(targets.keys()) lock_ops.unlock_safe(names, owner, run_name) - report.try_mark_run_dead(run_name) + # Provide a reason so individual jobs get a failure_reason when marked dead + report.try_mark_run_dead(run_name, reason="killed by user") def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False): @@ -93,7 +94,8 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False) owner = job_info['owner'] if kill_processes(run_name, [job_info.get('pid')]): return - report.try_push_job_info(job_info, dict(status="dead")) + # Indicate this was a manual kill so the results server can show a reason + report.try_push_job_info(job_info, dict(status="dead", failure_reason="killed by user")) if 'machine_type' in job_info: teuthology.exporter.JobResults().record( machine_type=job_info["machine_type"], diff --git a/teuthology/report.py b/teuthology/report.py index f0a4472017..111dce513b 100644 --- a/teuthology/report.py +++ b/teuthology/report.py @@ -566,7 +566,7 @@ def try_delete_job(job_id): try_delete_job(job_id) -def try_mark_run_dead(run_name): +def try_mark_run_dead(run_name, reason=None): """ Using the same error checking and retry mechanism as try_push_job_info(), mark any unfinished runs as dead. @@ -578,18 +578,29 @@ def try_mark_run_dead(run_name): if not reporter.base_uri: return - log.debug("Marking run as dead: {name}".format(name=run_name)) + log.debug("Marking run as dead: {name} reason={reason}".format(name=run_name, reason=reason)) jobs = reporter.get_jobs(run_name, fields=['status']) for job in jobs: if job['status'] not in ['pass', 'fail', 'dead']: job_id = job['job_id'] try: log.info("Marking job {job_id} as dead".format(job_id=job_id)) - reporter.report_job(run_name, job['job_id'], dead=True) - if "machine_type" in job: + # Load existing job_info from the archive, merge in our + # extra fields so the results server gets a useful + # failure_reason when a run is marked dead manually. + job_info = reporter.serializer.job_info(run_name, job_id) + # Ensure status is set to dead and include a reason if given + job_info.update({'status': 'dead'}) + if reason: + # Use the common 'failure_reason' field elsewhere in the + # codebase so tooling can pick it up. + job_info['failure_reason'] = reason + + reporter.report_job(run_name, job_id, job_info=job_info) + if "machine_type" in job_info: teuthology.exporter.JobResults().record( - machine_type=job["machine_type"], - status=job["status"], + machine_type=job_info["machine_type"], + status=job_info["status"], ) except report_exceptions: log.exception("Could not mark job as dead: {job_id}".format( diff --git a/teuthology/task/internal/__init__.py b/teuthology/task/internal/__init__.py index 15b8f81f54..4472fed9b4 100644 --- a/teuthology/task/internal/__init__.py +++ b/teuthology/task/internal/__init__.py @@ -112,7 +112,9 @@ def check_packages(ctx, config): # set the failure message and update paddles with the status ctx.summary["failure_reason"] = msg set_status(ctx.summary, "dead") - report.try_push_job_info(ctx.config, dict(status='dead')) + # Include the failure_reason so the results server records why + # this job was marked dead (e.g. missing packages) + report.try_push_job_info(ctx.config, dict(status='dead', failure_reason=msg)) raise VersionNotFoundError(package.base_url) else: log.info( diff --git a/teuthology/test/test_report_dead_reason.py b/teuthology/test/test_report_dead_reason.py new file mode 100644 index 0000000000..8bb8426c8f --- /dev/null +++ b/teuthology/test/test_report_dead_reason.py @@ -0,0 +1,36 @@ +import pytest +from unittest.mock import patch, MagicMock + +import teuthology.report as report + + +@patch('teuthology.report.ResultsReporter') +def test_try_mark_run_dead_includes_reason(mock_reporter_cls): + # Set up a fake reporter with serializer.job_info and report_job + mock_reporter = MagicMock() + mock_reporter_cls.return_value = mock_reporter + + # Simulate one job returned by get_jobs + mock_reporter.get_jobs.return_value = [ + {'job_id': '1', 'status': 'running'} + ] + + # serializer.job_info should return a dict representing archived job info + mock_reporter.serializer.job_info.return_value = { + 'job_id': '1', + 'machine_type': 'smithi', + } + + # Call the function under test + report.try_mark_run_dead('fake-run', reason='killed by user') + + # Ensure report_job was called with job_info that contains failure_reason + assert mock_reporter.report_job.called + called_args, called_kwargs = mock_reporter.report_job.call_args + # call signature: report_job(run_name, job_id, job_info=...) + assert called_args[0] == 'fake-run' + assert called_args[1] == '1' + + job_info = called_kwargs.get('job_info') if 'job_info' in called_kwargs else called_args[2] + assert job_info['status'] == 'dead' + assert job_info['failure_reason'] == 'killed by user' From 4e2e59d6f78f0855ef7b64cb0431c90cb840e095 Mon Sep 17 00:00:00 2001 From: Annmool Date: Tue, 4 Nov 2025 10:20:16 +0530 Subject: [PATCH 2/3] ci: sanitize docker image tag; fix flake8 unused import in test --- .github/workflows/dev_container.yml | 18 +++++++++++++++++- teuthology/test/test_report_dead_reason.py | 1 - 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev_container.yml b/.github/workflows/dev_container.yml index 8540a2e5be..4f9f0c425b 100644 --- a/.github/workflows/dev_container.yml +++ b/.github/workflows/dev_container.yml @@ -29,11 +29,27 @@ jobs: registry: quay.io username: ${{ secrets.QUAY_USERNAME }} password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Prepare image tag + id: tag + run: | + # Derive a safe tag: prefer head ref for PRs, otherwise ref name. + if [ "${{ github.event_name }}" = "pull_request" ]; then + raw_tag="${{ github.head_ref }}" + else + raw_tag="${{ github.ref_name }}" + fi + # Replace any characters invalid in container tags with '-' + safe_tag=$(echo "$raw_tag" | sed -E 's#[^A-Za-z0-9_.-]#-#g') + # Avoid empty tag; fallback to short SHA + if [ -z "$safe_tag" ]; then + safe_tag=${GITHUB_SHA::7} + fi + echo "safe_tag=$safe_tag" >> $GITHUB_OUTPUT - name: Build and push uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 env: QUAY_URI: quay.io/ceph-infra/teuthology-dev - QUAY_TAG: ${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }} + QUAY_TAG: ${{ steps.tag.outputs.safe_tag }} with: context: . file: containers/teuthology-dev/Dockerfile diff --git a/teuthology/test/test_report_dead_reason.py b/teuthology/test/test_report_dead_reason.py index 8bb8426c8f..aee230198f 100644 --- a/teuthology/test/test_report_dead_reason.py +++ b/teuthology/test/test_report_dead_reason.py @@ -1,4 +1,3 @@ -import pytest from unittest.mock import patch, MagicMock import teuthology.report as report From f86da1ed6a7b6f6f49d1bcd0055c7b84f22c0e48 Mon Sep 17 00:00:00 2001 From: Annmool Date: Fri, 7 Nov 2025 18:59:42 +0530 Subject: [PATCH 3/3] chore: remove explanatory comments from modified reporting paths --- teuthology/kill.py | 2 -- teuthology/report.py | 3 --- teuthology/task/internal/__init__.py | 3 --- 3 files changed, 8 deletions(-) diff --git a/teuthology/kill.py b/teuthology/kill.py index 152560b314..29a664765d 100755 --- a/teuthology/kill.py +++ b/teuthology/kill.py @@ -76,7 +76,6 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None, targets = find_targets(run_name) names = list(targets.keys()) lock_ops.unlock_safe(names, owner, run_name) - # Provide a reason so individual jobs get a failure_reason when marked dead report.try_mark_run_dead(run_name, reason="killed by user") @@ -94,7 +93,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False) owner = job_info['owner'] if kill_processes(run_name, [job_info.get('pid')]): return - # Indicate this was a manual kill so the results server can show a reason report.try_push_job_info(job_info, dict(status="dead", failure_reason="killed by user")) if 'machine_type' in job_info: teuthology.exporter.JobResults().record( diff --git a/teuthology/report.py b/teuthology/report.py index 111dce513b..3f0b3b089f 100644 --- a/teuthology/report.py +++ b/teuthology/report.py @@ -589,11 +589,8 @@ def try_mark_run_dead(run_name, reason=None): # extra fields so the results server gets a useful # failure_reason when a run is marked dead manually. job_info = reporter.serializer.job_info(run_name, job_id) - # Ensure status is set to dead and include a reason if given job_info.update({'status': 'dead'}) if reason: - # Use the common 'failure_reason' field elsewhere in the - # codebase so tooling can pick it up. job_info['failure_reason'] = reason reporter.report_job(run_name, job_id, job_info=job_info) diff --git a/teuthology/task/internal/__init__.py b/teuthology/task/internal/__init__.py index 4472fed9b4..a198281550 100644 --- a/teuthology/task/internal/__init__.py +++ b/teuthology/task/internal/__init__.py @@ -109,11 +109,8 @@ def check_packages(ctx, config): ver=package.sha1, ) log.error(msg) - # set the failure message and update paddles with the status ctx.summary["failure_reason"] = msg set_status(ctx.summary, "dead") - # Include the failure_reason so the results server records why - # this job was marked dead (e.g. missing packages) report.try_push_job_info(ctx.config, dict(status='dead', failure_reason=msg)) raise VersionNotFoundError(package.base_url) else: