ceph · Annmool · Nov 3, 2025 · Nov 4, 2025 · Nov 7, 2025 · amathuria
@@ -29,11 +29,27 @@ jobs:
           registry: quay.io
           username: ${{ secrets.QUAY_USERNAME }}
           password: ${{ secrets.QUAY_ROBOT_TOKEN }}
+      - name: Prepare image tag
+        id: tag
+        env:
+          # Prefer the head ref for PRs, otherwise the ref name. The value is
+          # handed over via the environment, not expanded into the script
+          # text, so a crafted branch name cannot inject shell commands.
+          RAW_TAG: ${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }}
+        run: |
+          # Replace any characters invalid in container tags with '-'
+          safe_tag=$(printf '%s' "$RAW_TAG" | sed -E 's#[^A-Za-z0-9_.-]#-#g')
+          # Avoid empty tag; fallback to short SHA
+          if [ -z "$safe_tag" ]; then
+            safe_tag=${GITHUB_SHA::7}
+          fi
+          # Publish the tag as this step's `safe_tag` output.
+          echo "safe_tag=$safe_tag" >> "$GITHUB_OUTPUT"
       - name: Build and push
         uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4
         env:
           QUAY_URI: quay.io/ceph-infra/teuthology-dev
-          QUAY_TAG: ${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }}
+          QUAY_TAG: ${{ steps.tag.outputs.safe_tag }}
         with:
           context: .
           file: containers/teuthology-dev/Dockerfile

@@ -76,7 +76,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         targets = find_targets(run_name)
         names = list(targets.keys())
         lock_ops.unlock_safe(names, owner, run_name)
-    report.try_mark_run_dead(run_name)
+    report.try_mark_run_dead(run_name, reason="killed by user")
 
 
 def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
@@ -93,7 +93,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
         owner = job_info['owner']
     if kill_processes(run_name, [job_info.get('pid')]):
         return
-    report.try_push_job_info(job_info, dict(status="dead"))
+    report.try_push_job_info(job_info, dict(status="dead", failure_reason="killed by user"))
     if 'machine_type' in job_info:
         teuthology.exporter.JobResults().record(
             machine_type=job_info["machine_type"],

@@ -566,7 +566,7 @@ def try_delete_job(job_id):
         try_delete_job(job_id)
 
 
-def try_mark_run_dead(run_name):
+def try_mark_run_dead(run_name, reason=None):
     """
     Using the same error checking and retry mechanism as try_push_job_info(),
     mark any unfinished runs as dead.
@@ -578,18 +578,26 @@ def try_mark_run_dead(run_name):
     if not reporter.base_uri:
         return
 
-    log.debug("Marking run as dead: {name}".format(name=run_name))
+    log.debug("Marking run as dead: {name} reason={reason}".format(name=run_name, reason=reason))
     jobs = reporter.get_jobs(run_name, fields=['status'])
     for job in jobs:
         if job['status'] not in ['pass', 'fail', 'dead']:
             job_id = job['job_id']
             try:
                 log.info("Marking job {job_id} as dead".format(job_id=job_id))
-                reporter.report_job(run_name, job['job_id'], dead=True)
-                if "machine_type" in job:
+                # Load existing job_info from the archive, merge in our
+                # extra fields so the results server gets a useful
+                # failure_reason when a run is marked dead manually.
+                job_info = reporter.serializer.job_info(run_name, job_id)
+                job_info.update({'status': 'dead'})
+                if reason:
+                    job_info['failure_reason'] = reason
+
+                reporter.report_job(run_name, job_id, job_info=job_info)
+                if "machine_type" in job_info:
                     teuthology.exporter.JobResults().record(
-                        machine_type=job["machine_type"],
-                        status=job["status"],
+                        machine_type=job_info["machine_type"],
+                        status=job_info["status"],
                     )
             except report_exceptions:
                 log.exception("Could not mark job as dead: {job_id}".format(

@@ -109,10 +109,9 @@ def check_packages(ctx, config):
                 ver=package.sha1,
             )
             log.error(msg)
-            # set the failure message and update paddles with the status
             ctx.summary["failure_reason"] = msg
             set_status(ctx.summary, "dead")
-            report.try_push_job_info(ctx.config, dict(status='dead'))
+            report.try_push_job_info(ctx.config, dict(status='dead', failure_reason=msg))
             raise VersionNotFoundError(package.base_url)
     else:
         log.info(

@@ -0,0 +1,35 @@
+from unittest.mock import patch, MagicMock
+
+import teuthology.report as report
+
+
+@patch('teuthology.report.ResultsReporter')
+def test_try_mark_run_dead_includes_reason(mock_reporter_cls):
+    """A reason given to try_mark_run_dead() lands in the reported job_info."""
+    # Stand-in reporter handed back whenever ResultsReporter() is constructed.
+    fake_reporter = mock_reporter_cls.return_value = MagicMock()
+
+    # get_jobs yields a single unfinished job ...
+    unfinished = {'job_id': '1', 'status': 'running'}
+    fake_reporter.get_jobs.return_value = [unfinished]
+    # ... and the serializer supplies its archived job info.
+    archived = {'job_id': '1', 'machine_type': 'smithi'}
+    fake_reporter.serializer.job_info.return_value = archived
+
+    report.try_mark_run_dead('fake-run', reason='killed by user')
+
+    # Expected call shape: report_job(run_name, job_id, job_info=...)
+    assert fake_reporter.report_job.called
+    positional, keywords = fake_reporter.report_job.call_args
+    assert positional[0] == 'fake-run'
+    assert positional[1] == '1'
+
+    # job_info may arrive by keyword or as the third positional argument.
+    if 'job_info' in keywords:
+        sent_info = keywords.get('job_info')
+    else:
+        sent_info = positional[2]
+
+    # The dead status and the caller's reason must both be reported.
+    assert sent_info['status'] == 'dead'
+    assert sent_info['failure_reason'] == 'killed by user'