[Pytorch AutoRevert] - Improves autorevert check heuristics (#6853)

jeanschmidt · web-flow · commit 5f86d76072dc · 2025-06-30T20:33:35.000+02:00
Make some improvements to the back-analysis for the revert logic, with the
goal of improving precision and recall and validating it as a viable strategy.

Checked against the workflows: pull, trunk, inductor,
linux-binary-manywheel

Old code:
```
Timeframe: 720 hours
Commits checked: 6177
Auto revert patterns detected: 188
Actual reverts inside auto revert patterns detected: 24 (12.8%)
Total revert commits in period: 115
Reverts that dont match any auto revert pattern detected: 91
```

Newer code:
```
Workflow(s): pull, trunk, inductor, linux-binary-manywheel
Timeframe: 720 hours
Commits checked: 5403
Auto revert patterns detected: 442
Actual reverts inside auto revert patterns detected (precision): 48 (10.9%)
Total revert commits in period: 115
Reverts that dont match any auto revert pattern detected (recall): 67 (58.3%)
Per workflow precision:
  pull: 45 reverts out of 411 patterns (10.9%)
  trunk: 1 reverts out of 8 patterns (12.5%)
  inductor: 2 reverts out of 20 patterns (10.0%)
  linux-binary-manywheel: 0 reverts out of 3 patterns (0.0%)
```

Critical implemented changes:
* Look forward and back for the first commit that ran the failed job,
instead of always relying on the commit right before or right
after.
* Job names have parts we don't care about, like shard indices. As a failure
could happen in any shard, we want to find any shard with the same
failure.

Things I tried that didn't lead to great results:
* ignoring error classification - precision too low, no significant
increase in recall
* not requiring error repetition - precision too low, no significant
increase in recall

My take:
A precision of 10% justifies the cost of re-running jobs in order to
confirm redness status. Even though it is not possible to test this
offline, I suspect that requiring the same output 2 times for all 3
signals should raise the precision to a very high standard.
Unfortunately, the only way to test this is to run it in shadow mode.

A recall of 55% suggests that this can capture **most** of
the introduced trunk redness errors. Many reverts might not be caused
by CI redness, especially not in the workflows we are analyzing (they could
be due to performance degradation, GHF/internal reasons, and many others). This
number seems comfortable enough to provide a substantial gain in CI
quality.
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py
@@ -65,9 +65,9 @@ def get_opts() -> argparse.Namespace:
     # no subcommand runs the lambda flow
     subparsers = parser.add_subparsers(dest="subcommand")
 
-    # autorevert subcommand
+    # autorevert-checker subcommand
     workflow_parser = subparsers.add_parser(
-        "autorevert", help="Analyze workflows looking for autorevert patterns"
+        "autorevert-checker", help="Analyze workflows looking for autorevert patterns"
     )
     workflow_parser.add_argument(
         "workflows",
@@ -85,9 +85,9 @@ def get_opts() -> argparse.Namespace:
         help="Show detailed output including commit summaries",
     )
 
-    # workflow-restart-checke subcommand
+    # workflow-restart-checker subcommand
     workflow_restart_parser = subparsers.add_parser(
-        "workflow-restart-checke", help="Check for restarted workflows"
+        "workflow-restart-checker", help="Check for restarted workflows"
     )
     workflow_restart_parser.add_argument(
         "workflow",
@@ -145,7 +145,7 @@ def main(*args, **kwargs) -> None:
 
     if opts.subcommand == "lambda":
         print("TODO: run lambda flow")
-    elif opts.subcommand == "workflows":
+    elif opts.subcommand == "autorevert-checker":
         autorevert_checker(opts.workflows, hours=opts.hours, verbose=opts.verbose)
     elif opts.subcommand == "workflow-restart-checker":
         workflow_restart_checker(opts.workflow, commit=opts.commit, days=opts.days)
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py
@@ -7,7 +7,7 @@
 import re
 from dataclasses import dataclass
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional, Set
+from typing import Dict, Iterable, List, Optional, Set, Tuple
 
 from .clickhouse_client_helper import CHCliFactory
 
@@ -47,15 +47,15 @@ def has_pending_jobs(self) -> bool:
     @property
     def job_base_names(self) -> Set[str]:
         if not hasattr(self, "_job_base_names"):
-            self._job_base_names = self._get_job_base_names()
+            self._job_base_names = self.get_job_base_names()
         return self._job_base_names
 
     def normalize_job_name(self, name: str) -> str:
         """Strip shard suffix from job name for matching."""
         # Remove patterns like ", 1, 1, " or ", 2, 3, " from job names
         return re.sub(r", \d+, \d+, ", ", ", name)
 
-    def _get_job_base_names(self) -> Set[str]:
+    def get_job_base_names(self) -> Set[str]:
         """Get normalized job names (without shard info)."""
         return {self.normalize_job_name(j.name) for j in self.jobs}
 
@@ -107,13 +107,17 @@ def _fetch_workflow_data(self):
             name,
             conclusion,
             status,
-            torchci_classification.rule as classification_rule,
-            workflow_created_at
-        FROM workflow_job FINAL
-        WHERE workflow_name IN {workflow_names:Array(String)}
-          AND head_branch = 'main'
-          AND workflow_created_at >= {lookback_time:DateTime}
-        ORDER BY workflow_name, workflow_created_at DESC, head_sha, name
+            torchci_classification.rule AS classification_rule,
+            created_at AS workflow_created_at
+        FROM
+            workflow_job FINAL
+        WHERE
+            workflow_name IN {workflow_names:Array(String)}
+            AND head_branch = 'main'
+            AND created_at >= {lookback_time:DateTime}
+            AND dynamoKey LIKE 'pytorch/pytorch/%'
+        ORDER BY
+            workflow_name, workflow_created_at DESC, head_sha, name
         """
 
         result = CHCliFactory().client.query(
@@ -194,6 +198,31 @@ def _fetch_commit_history(self):
             for row in result.result_rows
         ]
 
+    def _find_last_commit_with_job(
+        self, commits: Iterable[CommitJobs], job_name: str
+    ) -> Tuple[Optional[CommitJobs], Optional[List[JobResult]]]:
+        """
+        Find the first commit in iteration order that ran the specified job.
+
+        Args:
+            commits: CommitJobs to scan; the first match in this order wins
+            job_name: Job name prefix (text before the first "(") to look for
+
+        Returns:
+            (commit, its matching jobs) for the first match, else (None, None)
+        """
+        for commit in commits:
+            # Compare only the text before "(" (drops the parenthesized suffix).
+            job_results = [
+                job for job in commit.jobs if job.name.split("(")[0] == job_name
+            ]
+            if job_results:
+                # Return from inside the loop: checking after it would pair the
+                # final commit iterated with jobs gathered from every commit.
+                return commit, job_results
+        # No commit in the iterable ran the job.
+        return None, None
+
     def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
         """
         Detect all autorevert patterns in commit job data for a specific workflow.
@@ -215,60 +244,90 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
 
         patterns = []
 
-        for i in range(len(commits) - 2):
-            newer_commit1 = commits[i]  # Most recent
-            newer_commit2 = commits[i + 1]  # Second most recent
-            older_commit = commits[i + 2]  # Third most recent
-
-            # All commits must have jobs (signal)
-            if not all(c.jobs for c in [newer_commit1, newer_commit2, older_commit]):
-                continue
+        for i in range(1, len(commits) - 1):
+            suspected_commit1 = commits[i]  # The commit we want to check for failure
 
-            # Oldest commit cannot have pending jobs
-            if older_commit.has_pending_jobs:
+            if suspected_commit1.has_pending_jobs:
                 continue
 
-            # Find common failure classifications between the 2 newer commits
-            newer1_failures = {j.classification_rule for j in newer_commit1.failed_jobs}
-            newer2_failures = {j.classification_rule for j in newer_commit2.failed_jobs}
-            common_failures = newer1_failures & newer2_failures
+            suspected_failures = {
+                (
+                    j.classification_rule,
+                    j.name.split("(")[0],
+                )
+                for j in suspected_commit1.failed_jobs
+            }
+
+            common_failures = set()
+            for (
+                suspected_failure_class_rule,
+                suspected_failure_job_name,
+            ) in suspected_failures:
+                newer_commit_same_job, newer_same_jobs = (
+                    self._find_last_commit_with_job(
+                        (commits[j] for j in range(i - 1, -1, -1)),
+                        suspected_failure_job_name,
+                    )
+                )
+                if not newer_commit_same_job or not newer_same_jobs:
+                    # No newer commit with the same job found
+                    continue
+
+                if any(
+                    j.classification_rule == suspected_failure_class_rule
+                    for j in newer_same_jobs
+                ):
+                    # The newer commit has the same job failing
+                    common_failures.add(
+                        (
+                            suspected_failure_class_rule,
+                            suspected_failure_job_name,
+                        )
+                    )
 
             if not common_failures:
                 continue
 
-            # Check if older commit lacks these failures but has overlapping job coverage
-            older_failures = {j.classification_rule for j in older_commit.failed_jobs}
-            older_job_names = older_commit.get_job_base_names()
-
-            for failure_rule in common_failures:
-                if failure_rule in older_failures:
-                    continue  # Older commit also has this failure
-
-                # Get job names that had this failure in newer commits
-                failed_job_names = set()
-                for commit in [newer_commit1, newer_commit2]:
-                    for job in commit.failed_jobs:
-                        if job.classification_rule == failure_rule:
-                            failed_job_names.add(commit.normalize_job_name(job.name))
-
-                # Check if older commit has overlapping job coverage
-                if failed_job_names & older_job_names:
-                    patterns.append(
-                        {
-                            "pattern_detected": True,
-                            "workflow_name": workflow_name,
-                            "failure_rule": failure_rule,
-                            "newer_commits": [
-                                newer_commit1.head_sha,
-                                newer_commit2.head_sha,
-                            ],
-                            "older_commit": older_commit.head_sha,
-                            "failed_job_names": list(failed_job_names),
-                            "older_job_coverage": list(
-                                older_job_names & failed_job_names
-                            ),
-                        }
+            for failure_rule, job_name in common_failures:
+                last_commit_with_same_job, last_same_jobs = (
+                    self._find_last_commit_with_job(
+                        (commits[j] for j in range(i + 1, len(commits))), job_name
                     )
+                )
+
+                if not last_commit_with_same_job or not last_same_jobs:
+                    # No older commit with the same job found
+                    continue
+
+                if any(
+                    j.name.split("(")[0] != job_name
+                    for j in last_commit_with_same_job.failed_jobs
+                ):
+                    # older commit has a failed job other than this one; skip
+                    continue
+
+                if any(
+                    j.classification_rule == suspected_failure_class_rule
+                    for j in last_same_jobs
+                ):
+                    # The last commit with the same job has the same failure classification
+                    continue
+
+                patterns.append(
+                    {
+                        "pattern_detected": True,
+                        "workflow_name": workflow_name,
+                        "failure_rule": failure_rule,
+                        "newer_commits": [
+                            "newer_commit_same_job.head_sha",
+                            suspected_commit1.head_sha,
+                        ],
+                        "older_commit": "last_commit_with_same_job.head_sha",
+                        "failed_job_names": list("last_same_job.name"),
+                        "older_job_coverage": [],
+                    }
+                )
+                break
 
         return patterns
 
@@ -314,6 +373,20 @@ def detect_autorevert_pattern(self) -> List[Dict]:
 
         return all_patterns
 
+    def get_commits_reverted(self) -> Set[str]:
+        """
+        Collect the SHAs of commits in self.commit_history that were reverted.
+
+        Returns:
+            Set of SHAs for which is_commit_reverted() returned a truthy result
+        """
+        # One is_commit_reverted() lookup per commit in the fetched history.
+        return {
+            entry["sha"]
+            for entry in self.commit_history
+            if self.is_commit_reverted(entry["sha"])
+        }
+
     def is_commit_reverted(self, target_commit_sha: str) -> Optional[Dict]:
         """
         Check if a commit was reverted within the lookback window.
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py
@@ -1,5 +1,6 @@
+from collections import defaultdict
+
 from ..autorevert_checker import AutorevertPatternChecker
-from ..clickhouse_client_helper import CHCliFactory
 
 
 def autorevert_checker(
@@ -44,6 +45,8 @@ def autorevert_checker(
 
     # Detect patterns
     patterns = checker.detect_autorevert_pattern()
+    reverts = checker.get_commits_reverted()
+    not_found_reverts = reverts.copy()
 
     if patterns:
         print(
@@ -52,15 +55,14 @@ def autorevert_checker(
 
         # Create a revert checker (with extended lookback for finding reverts)
         revert_checker = AutorevertPatternChecker(
-            CHCliFactory().client, workflow_names=[], lookback_hours=hours * 2
+            workflow_names=[], lookback_hours=hours * 2
         )
 
         # Track reverts
         reverted_patterns = []
 
         for i, pattern in enumerate(patterns, 1):
-            if len(patterns) > 1:
-                print(f"\nPattern #{i}:")
+            print(f"\nPattern #{i}:")
 
             print(f"Failure rule: '{pattern['failure_rule']}'")
             print(
@@ -83,13 +85,14 @@ def autorevert_checker(
             revert_result = revert_checker.is_commit_reverted(second_commit)
 
             if revert_result:
+                not_found_reverts.discard(second_commit)
                 print(
                     f"✓ REVERTED: {second_commit[:8]} was reverted by {revert_result['revert_sha'][:8]} "
                     f"after {revert_result['hours_after_target']:.1f} hours"
                 )
                 reverted_patterns.append(pattern)
             else:
-                print(f"✗ NOT REVERTED: {second_commit[:8]} was not reverted")
+                print(f"✗ NOT REVERTED: {second_commit} was not reverted")
 
             if verbose:
                 print(f"Failed jobs ({len(pattern['failed_job_names'])}):")
@@ -121,11 +124,34 @@ def autorevert_checker(
         )
         print(f"Commits checked: {total_commits}")
 
-        print(f"Patterns detected: {len(patterns)}")
+        print(f"Auto revert patterns detected: {len(patterns)}")
+        print(
+            "Actual reverts inside auto revert patterns detected (precision): "
+            + f"{len(reverted_patterns)} ({len(reverted_patterns)/len(patterns)*100:.1f}%)"
+        )
+        print(f"Total revert commits in period: {len(reverts)}")
         print(
-            f"Actual reverts: {len(reverted_patterns)} ({len(reverted_patterns)/len(patterns)*100:.1f}%)"
+            "Reverts that dont match any auto revert pattern detected (recall): "
+            + f"{len(not_found_reverts)} ({len(not_found_reverts)/len(reverts)*100:.1f}%)"
         )
 
+        workflow_statistics = defaultdict(lambda: {"match_pattern": 0, "reverts": 0})
+        for pattern in patterns:
+            workflow_statistics[pattern["workflow_name"]]["match_pattern"] += 1
+            if pattern["newer_commits"][1] in reverts:
+                workflow_statistics[pattern["workflow_name"]]["reverts"] += 1
+
+        print("Per workflow precision:")
+        for workflow, stats in workflow_statistics.items():
+            precision = (
+                stats["reverts"] / stats["match_pattern"] * 100
+                if stats["match_pattern"] > 0
+                else 0.0
+            )
+            print(
+                f"  {workflow}: {stats['reverts']} reverts out of {stats['match_pattern']} patterns ({precision:.1f}%)"
+            )
+
         if reverted_patterns:
             print("\nReverted patterns:")
             for pattern in reverted_patterns: