
Commit 4eb9d0c

[autorevert] Fix bug on job retrieval for autorevert lambda (#6979)
After fixing the bug, the new statistics are:

```
==================================================
SUMMARY STATISTICS
==================================================
Workflow(s): Lint, trunk, pull, inductor, linux-binary-manywheel
Timeframe: 4320 hours
Commits checked: 33519
Auto revert patterns detected: 1345
Actual reverts inside auto revert patterns detected (precision): 219 (16.3%)
Total revert commits in period: 585

Revert categories:
  nosignal: 202 (34.5%)
  ghfirst: 156 (26.7%)
  uncategorized: 104 (17.8%)
  ignoredsignal: 68 (11.6%)
  weird: 45 (7.7%)
  landrace: 10 (1.7%)

Total reverts excluding ghfirst: 429
Reverts (excluding ghfirst) that dont match any auto revert pattern detected (recall): 250 (58.3%)

Per workflow precision:
  Lint: 45 reverts out of 75 patterns (60.0%) [excluding ghfirst: 41 (54.7%)]
  trunk: 30 reverts out of 136 patterns (22.1%) [excluding ghfirst: 28 (20.6%)]
  pull: 104 reverts out of 859 patterns (12.1%) [excluding ghfirst: 92 (10.7%)]
  inductor: 39 reverts out of 269 patterns (14.5%) [excluding ghfirst: 38 (14.1%)]
  linux-binary-manywheel: 1 reverts out of 6 patterns (16.7%) [excluding ghfirst: 0 (0.0%)]
```

The main bug was that, when checking for commits before/after with the same job, the code concatenated all commits and jobs before and after instead of returning only the next one. I also added ergonomics for the lambda invocation.

---------

Co-authored-by: Ivan Zaitsev <[email protected]>
1 parent 2148043 commit 4eb9d0c
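
For quick orientation, the headline percentages in the summary are simple ratios of the counts above. A standalone sketch of the arithmetic (variable names are ours, not from the codebase):

```python
# Sanity check of the percentages quoted in the commit message;
# the counts are copied from the summary above, nothing is queried here.
patterns_detected = 1345
reverts_inside_patterns = 219            # reverts matched by an autorevert pattern
reverts_excluding_ghfirst = 429
unmatched_reverts = 250                  # reverts with no matching pattern

precision = reverts_inside_patterns / patterns_detected
unmatched_share = unmatched_reverts / reverts_excluding_ghfirst  # the figure labelled "recall"

print(f"{precision:.1%}")        # 16.3%
print(f"{unmatched_share:.1%}")  # 58.3%
```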

File tree

4 files changed: +57 −27 lines


aws/lambda/pytorch-auto-revert/Makefile

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@ run-local: venv/bin/python
 
 .PHONY: run-local-workflows
 run-local-workflows: venv/bin/python
-	venv/bin/python -m pytorch_auto_revert workflows pull.yml
+	venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4320 --ignore-common-errors
 
 deployment.zip:
 	mkdir -p deployment
```

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py

Lines changed: 21 additions & 2 deletions
```diff
@@ -95,6 +95,11 @@ def get_opts() -> argparse.Namespace:
         action="store_true",
         help="Show what would be restarted without actually doing it (use with --do-restart)",
     )
+    workflow_parser.add_argument(
+        "--ignore-common-errors",
+        action="store_true",
+        help="Ignore common errors in autorevert patterns (e.g., 'No tests found')",
+    )
 
     # workflow-restart-checker subcommand
     workflow_restart_parser = subparsers.add_parser(
@@ -159,15 +164,29 @@ def main(*args, **kwargs) -> None:
             "ClickHouse connection test failed. Please check your configuration."
         )
 
-    if opts.subcommand == "lambda":
-        print("TODO: run lambda flow")
+    if opts.subcommand is None:
+        autorevert_checker(
+            [
+                "Lint",
+                "trunk",
+                "pull",
+                "inductor",
+                "linux-binary-manywheel",
+            ],
+            hours=2,
+            verbose=True,
+            do_restart=True,
+            dry_run=False,
+            ignore_common_errors=True,
+        )
     elif opts.subcommand == "autorevert-checker":
         autorevert_checker(
             opts.workflows,
             hours=opts.hours,
             verbose=opts.verbose,
             do_restart=opts.do_restart,
             dry_run=opts.dry_run,
+            ignore_common_errors=opts.ignore_common_errors,
         )
     elif opts.subcommand == "workflow-restart-checker":
         workflow_restart_checker(opts.workflow, commit=opts.commit, days=opts.days)
```
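
The behavioral change in `main()` is that a bare invocation (no subcommand, which is how the lambda entry point calls it) now runs the checker with built-in defaults instead of printing a TODO. A minimal argparse sketch of why `opts.subcommand` comes back as `None` in that case; the parser below is a simplified stand-in, not the real one in `__main__.py`:

```python
import argparse

# Simplified stand-in parser: subparsers created without required=True are
# optional, so parsing an empty argv leaves the dest attribute as None.
parser = argparse.ArgumentParser(prog="pytorch_auto_revert")
subparsers = parser.add_subparsers(dest="subcommand")
checker = subparsers.add_parser("autorevert-checker")
checker.add_argument("workflows", nargs="+")
checker.add_argument("--ignore-common-errors", action="store_true")

opts = parser.parse_args([])  # e.g. the lambda invoking with no CLI args
print(opts.subcommand)        # None -> takes the new default branch

opts = parser.parse_args(["autorevert-checker", "Lint", "--ignore-common-errors"])
print(opts.subcommand, opts.workflows, opts.ignore_common_errors)
# autorevert-checker ['Lint'] True
```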

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py

Lines changed: 22 additions & 22 deletions
```diff
@@ -63,14 +63,20 @@ def get_job_base_names(self) -> Set[str]:
 class AutorevertPatternChecker:
     """Detects autorevert patterns in workflow job failures."""
 
-    def __init__(self, workflow_names: List[str] = None, lookback_hours: int = 48):
+    def __init__(
+        self,
+        workflow_names: List[str] = None,
+        lookback_hours: int = 48,
+        ignore_classification_rules: Set[str] = None,
+    ):
         self.workflow_names = workflow_names or []
         self.lookback_hours = lookback_hours
         self._workflow_commits_cache: Dict[str, List[CommitJobs]] = {}
         self._commit_history = None
+        self._ignore_classification_rules = ignore_classification_rules or set()
 
     def get_workflow_commits(self, workflow_name: str) -> List[CommitJobs]:
-        """Get workflow commits for a specific workflow, fetching if needed."""
+        """Get workflow commits for a specific workflow, fetching if needed. From newer to older"""
         if workflow_name not in self._workflow_commits_cache:
             self._fetch_workflow_data()
         return self._workflow_commits_cache.get(workflow_name, [])
@@ -90,7 +96,7 @@ def commit_history(self) -> List[Dict]:
         return self._commit_history or []
 
     def _fetch_workflow_data(self):
-        """Fetch workflow job data from ClickHouse for all workflows in batch."""
+        """Fetch workflow job data from ClickHouse for all workflows in batch. From newer to older"""
         if not self.workflow_names:
             return
 
@@ -203,25 +209,25 @@ def _find_last_commit_with_job(
         self, commits: Iterable[CommitJobs], job_name: str
     ) -> Optional[Tuple[CommitJobs, List[JobResult]]]:
         """
-        Find the last commit in the iterable that has a job with the specified name.
+        Find the first commit (in the provided iteration order) that has a job with the specified name.
 
         Args:
             commits: Iterable of CommitJobs to search
             job_name: The job name to look for
 
         Returns:
-            The last CommitJobs object that contains the specified job, or None if not found
+            The first CommitJobs object (per the iterable's order) that contains the specified job, or None if not found
         """
         job_results = []
         for commit in commits:
             for job in commit.jobs:
                 if job.name.split("(")[0] == job_name:  # Normalize job name
                     job_results.append(job)
-        if job_results:
-            return (
-                commit,
-                job_results,
-            )
+            if job_results:
+                return (
+                    commit,
+                    job_results,
+                )
         return None, None
 
     def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
@@ -266,6 +272,10 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                 suspected_failure_class_rule,
                 suspected_failure_job_name,
             ) in suspected_failures:
+                if suspected_failure_class_rule in self._ignore_classification_rules:
+                    # Skip ignored classification rules
+                    continue
+
                 newer_commit_same_job, newer_same_jobs = (
                     self._find_last_commit_with_job(
                         (commits[j] for j in range(i - 1, -1, -1)),
@@ -304,18 +314,8 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                     # No older commit with the same job found
                     continue
 
-                if any(
-                    j.name.split("(")[0] != job_name
-                    for j in last_commit_with_same_job.failed_jobs
-                ):
-                    # newr commit has the same job failing
-                    continue
-
-                if any(
-                    j.classification_rule == suspected_failure_class_rule
-                    for j in last_same_jobs
-                ):
-                    # The last commit with the same job has the same failure classification
+                if any(j.classification_rule == failure_rule for j in last_same_jobs):
+                    # The older commit has the same job failing with same rule
                     continue
 
                 patterns.append(
```
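
The indentation change above is the actual bug fix: previously the `if job_results:` early return sat outside the `for commit in commits:` loop, so the helper accumulated matching jobs across every commit in the iterable and only returned after exhausting it, pairing them with whichever commit happened to be last. Moving the return inside the loop makes it stop at the first commit that has the job. A simplified, self-contained sketch of the fixed behavior (toy dicts instead of the real `CommitJobs`/`JobResult` objects):

```python
from typing import Iterable, List, Optional, Tuple

def find_first_commit_with_job(
    commits: Iterable[dict], job_name: str
) -> Tuple[Optional[dict], Optional[List[str]]]:
    """Return the first commit (in iteration order) carrying job_name, plus its matching jobs."""
    job_results: List[str] = []
    for commit in commits:
        for job in commit["jobs"]:
            if job.split("(")[0] == job_name:  # normalize "name(shard)" -> "name"
                job_results.append(job)
        if job_results:  # the fix: return inside the commit loop
            return commit, job_results
    return None, None

commits = [
    {"sha": "aaa", "jobs": ["lint(py3)"]},
    {"sha": "bbb", "jobs": ["lint(py3)", "build(x86)"]},
]
print(find_first_commit_with_job(commits, "lint"))
# ({'sha': 'aaa', 'jobs': ['lint(py3)']}, ['lint(py3)']) -- only the first match,
# not jobs concatenated across both commits as the old code effectively returned
```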

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -10,9 +10,20 @@ def autorevert_checker(
     verbose: bool = False,
     do_restart: bool = False,
     dry_run: bool = False,
+    ignore_common_errors=True,
 ):
+    common_errors = {
+        "GHA error",
+        "GHA timeout",
+        "sccache error",
+    }
+
     # Initialize checker
-    checker = AutorevertPatternChecker(workflow_names, hours)
+    checker = AutorevertPatternChecker(
+        workflow_names,
+        hours,
+        ignore_classification_rules=common_errors if ignore_common_errors else set(),
+    )
 
     # Fetch data
     if verbose:
@@ -108,7 +119,7 @@ def autorevert_checker(
                         "category", "uncategorized"
                     )
                     print(
-                        f"✓ REVERTED ({category}): {second_commit[:8]} was reverted by {revert_result['revert_sha'][:8]} "
+                        f"✓ REVERTED ({category}): {second_commit} was reverted by {revert_result['revert_sha'][:8]} "
                         f"after {revert_result['hours_after_target']:.1f} hours"
                     )
                     reverted_patterns.append(pattern)
```
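
With `--ignore-common-errors`, the three rules above are passed to the checker as `ignore_classification_rules`, and suspected failures classified under them are skipped before pattern matching. A small illustrative sketch of that filtering step; the "pytest failure" rule name is made up for the example:

```python
common_errors = {"GHA error", "GHA timeout", "sccache error"}

# (classification_rule, job_name) pairs, as the checker iterates suspected failures
suspected_failures = [
    ("GHA timeout", "linux-build"),    # infrastructure noise -> skipped
    ("pytest failure", "linux-test"),  # hypothetical real signal -> kept
]

kept = [(rule, job) for rule, job in suspected_failures if rule not in common_errors]
print(kept)  # [('pytest failure', 'linux-test')]
```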
