Skip to content

Commit 46a680f

Browse files
[autorevert] When checking job status, only expect target job list to be finished, not waiting for all jobs to have finished (#7023)
There are some pending jobs checks that are overly cautious and could be problematic in case of queuing of some exotic type of runner. This can hold autorevert hostage if there is a long queue or an outage in some type of runner that is unrelated to the patterns we're looking for. ## Older commits (baseline) When checking for patterns, we don't really care if there are pending jobs for the older commit, as long it is unrelated to the identified pattern. ## Repeated error identification Already correctly handled, it should retry jobs at first opportunity of a single repeated breaking job. ## `confirm_commit_caused_failure_on_restarted` `has_rule` already checks for failure conclusion. So it is more correct ignore if jobs are still pending and match in the first confirmation. But it is important to only check for the relevant job names if they finished in base, as any newly job could finish as failure, what should avoid reverting. --------- Co-authored-by: Ivan Zaitsev <[email protected]>
1 parent 47f28a3 commit 46a680f

File tree

2 files changed

+21
-18
lines changed

2 files changed

+21
-18
lines changed

aws/lambda/pytorch-auto-revert/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ run-local-dry: venv/bin/python
2727

2828
.PHONY: run-local-workflows
2929
run-local-workflows: venv/bin/python
30-
venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4380 --ignore-common-errors
30+
venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4380 --ignore-common-errors --verbose
3131

3232
deployment.zip:
3333
mkdir -p deployment

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ def has_pending_jobs(self) -> bool:
4949
def job_base_names(self) -> Set[str]:
5050
return self.get_job_base_names()
5151

52+
def jobs_with_base_name(self, job_base_name: str) -> List[JobResult]:
53+
"""Get all jobs with a specific normalized base name."""
54+
return [
55+
j for j in self.jobs if self.normalize_job_name(j.name) == job_base_name
56+
]
57+
5258
def normalize_job_name(self, name: str) -> str:
5359
"""Normalize job name to a stable base for matching across commits.
5460
@@ -272,9 +278,6 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
272278
for i in range(1, len(commits) - 1):
273279
suspected_commit1 = commits[i]
274280

275-
if suspected_commit1.has_pending_jobs:
276-
continue
277-
278281
# Extract unique (classification_rule, normalized job) pairs for failing jobs on the suspected commit
279282
suspected_failures = {
280283
(
@@ -340,8 +343,8 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
340343
# No older commit with same normalized job name found
341344
continue
342345

343-
# Ensure the oldest commit has stable signal (no running jobs)
344-
if last_commit_with_same_job.has_pending_jobs:
346+
# Ensure the oldest commit has stable signal for the jobs we care about (no running jobs)
347+
if any(j.status != "completed" for j in last_same_jobs):
345348
continue
346349

347350
if any(
@@ -503,30 +506,30 @@ def confirm_commit_caused_failure_on_restarted(self, pattern: Dict) -> bool:
503506
previous_commit = pattern["older_commit"]
504507

505508
# Fetch restarted jobs for first failing and previous commits
506-
failing_jobs = self._fetch_single_commit_jobs(
509+
failing_commit_jobs = self._fetch_single_commit_jobs(
507510
workflow_name, first_failing, restarted_only=True
508511
)
509-
prev_jobs = self._fetch_single_commit_jobs(
512+
prev_commit_jobs = self._fetch_single_commit_jobs(
510513
workflow_name, previous_commit, restarted_only=True
511514
)
512-
if not failing_jobs or not prev_jobs:
515+
if not failing_commit_jobs or not prev_commit_jobs:
513516
return False
514517

515-
# Pending check
516-
if failing_jobs.has_pending_jobs or prev_jobs.has_pending_jobs:
518+
failing_suspected_jobs = failing_commit_jobs.jobs_with_base_name(job_base)
519+
prev_suspected_jobs = prev_commit_jobs.jobs_with_base_name(job_base)
520+
if any(j.status != "completed" for j in prev_suspected_jobs):
521+
# Previous commit has pending jobs, cannot confirm
517522
return False
518523

519-
def has_rule(cj: CommitJobs, rule: str) -> bool:
524+
def has_rule(jobs: Iterable[JobResult], rule: str) -> bool:
520525
return any(
521-
cj.normalize_job_name(j.name) == job_base
522-
and j.classification_rule == rule
523-
and j.conclusion == "failure"
524-
for j in cj.jobs
526+
j.classification_rule == rule and j.conclusion == "failure"
527+
for j in jobs
525528
)
526529

527530
# Commit-caused if failing commit reproduces, previous does not
528-
return has_rule(failing_jobs, failure_rule) and not has_rule(
529-
prev_jobs, failure_rule
531+
return has_rule(failing_suspected_jobs, failure_rule) and not has_rule(
532+
prev_suspected_jobs, failure_rule
530533
)
531534

532535
def _get_commits_reverted(self) -> Set[str]:

0 commit comments

Comments
 (0)