Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion .github/workflows/dev_container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,27 @@ jobs:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_ROBOT_TOKEN }}
- name: Prepare image tag
id: tag
run: |
# Derive a safe tag: prefer head ref for PRs, otherwise ref name.
if [ "${{ github.event_name }}" = "pull_request" ]; then
raw_tag="${{ github.head_ref }}"
else
raw_tag="${{ github.ref_name }}"
fi
# Replace any characters invalid in container tags with '-'
safe_tag=$(echo "$raw_tag" | sed -E 's#[^A-Za-z0-9_.-]#-#g')
# Avoid empty tag; fallback to short SHA
if [ -z "$safe_tag" ]; then
safe_tag=${GITHUB_SHA::7}
fi
echo "safe_tag=$safe_tag" >> $GITHUB_OUTPUT
- name: Build and push
uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4
env:
QUAY_URI: quay.io/ceph-infra/teuthology-dev
QUAY_TAG: ${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }}
QUAY_TAG: ${{ steps.tag.outputs.safe_tag }}
with:
context: .
file: containers/teuthology-dev/Dockerfile
Expand Down
4 changes: 2 additions & 2 deletions teuthology/kill.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
targets = find_targets(run_name)
names = list(targets.keys())
lock_ops.unlock_safe(names, owner, run_name)
report.try_mark_run_dead(run_name)
report.try_mark_run_dead(run_name, reason="killed by user")


def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
Expand All @@ -93,7 +93,7 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
owner = job_info['owner']
if kill_processes(run_name, [job_info.get('pid')]):
return
report.try_push_job_info(job_info, dict(status="dead"))
report.try_push_job_info(job_info, dict(status="dead", failure_reason="killed by user"))
if 'machine_type' in job_info:
teuthology.exporter.JobResults().record(
machine_type=job_info["machine_type"],
Expand Down
20 changes: 14 additions & 6 deletions teuthology/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def try_delete_job(job_id):
try_delete_job(job_id)


def try_mark_run_dead(run_name):
def try_mark_run_dead(run_name, reason=None):
"""
Using the same error checking and retry mechanism as try_push_job_info(),
mark any unfinished runs as dead.
Expand All @@ -578,18 +578,26 @@ def try_mark_run_dead(run_name):
if not reporter.base_uri:
return

log.debug("Marking run as dead: {name}".format(name=run_name))
log.debug("Marking run as dead: {name} reason={reason}".format(name=run_name, reason=reason))
jobs = reporter.get_jobs(run_name, fields=['status'])
for job in jobs:
if job['status'] not in ['pass', 'fail', 'dead']:
job_id = job['job_id']
try:
log.info("Marking job {job_id} as dead".format(job_id=job_id))
reporter.report_job(run_name, job['job_id'], dead=True)
if "machine_type" in job:
# Load existing job_info from the archive, merge in our
# extra fields so the results server gets a useful
# failure_reason when a run is marked dead manually.
job_info = reporter.serializer.job_info(run_name, job_id)
job_info.update({'status': 'dead'})
if reason:
job_info['failure_reason'] = reason

reporter.report_job(run_name, job_id, job_info=job_info)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you still need dead=True here.
Otherwise it'll be False:
def report_job(self, run_name, job_id, job_info=None, dead=False)
and once you do that job status is already updated in report_job():

 if dead and get_status(job_info) is None:
            set_status(job_info, 'dead')

I think we can stick to just updating the failure reason here

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, got it. I’ll make the suggested changes accordingly.

if "machine_type" in job_info:
teuthology.exporter.JobResults().record(
machine_type=job["machine_type"],
status=job["status"],
machine_type=job_info["machine_type"],
status=job_info["status"],
)
except report_exceptions:
log.exception("Could not mark job as dead: {job_id}".format(
Expand Down
3 changes: 1 addition & 2 deletions teuthology/task/internal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,9 @@ def check_packages(ctx, config):
ver=package.sha1,
)
log.error(msg)
# set the failure message and update paddles with the status
ctx.summary["failure_reason"] = msg
set_status(ctx.summary, "dead")
report.try_push_job_info(ctx.config, dict(status='dead'))
report.try_push_job_info(ctx.config, dict(status='dead', failure_reason=msg))
raise VersionNotFoundError(package.base_url)
else:
log.info(
Expand Down
35 changes: 35 additions & 0 deletions teuthology/test/test_report_dead_reason.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from unittest.mock import patch, MagicMock

import teuthology.report as report


@patch('teuthology.report.ResultsReporter')
def test_try_mark_run_dead_includes_reason(mock_reporter_cls):
# Set up a fake reporter with serializer.job_info and report_job
mock_reporter = MagicMock()
mock_reporter_cls.return_value = mock_reporter

# Simulate one job returned by get_jobs
mock_reporter.get_jobs.return_value = [
{'job_id': '1', 'status': 'running'}
]

# serializer.job_info should return a dict representing archived job info
mock_reporter.serializer.job_info.return_value = {
'job_id': '1',
'machine_type': 'smithi',
}

# Call the function under test
report.try_mark_run_dead('fake-run', reason='killed by user')

# Ensure report_job was called with job_info that contains failure_reason
assert mock_reporter.report_job.called
called_args, called_kwargs = mock_reporter.report_job.call_args
# call signature: report_job(run_name, job_id, job_info=...)
assert called_args[0] == 'fake-run'
assert called_args[1] == '1'

job_info = called_kwargs.get('job_info') if 'job_info' in called_kwargs else called_args[2]
assert job_info['status'] == 'dead'
assert job_info['failure_reason'] == 'killed by user'