diff --git a/aws/lambda/pytorch-auto-revert/Makefile b/aws/lambda/pytorch-auto-revert/Makefile
index 556b3c99dd..12327ad3bf 100644
--- a/aws/lambda/pytorch-auto-revert/Makefile
+++ b/aws/lambda/pytorch-auto-revert/Makefile
@@ -21,9 +21,13 @@ venv/bin/lintrunner: venv/bin/python
 run-local: venv/bin/python
 	venv/bin/python -m pytorch_auto_revert
 
+.PHONY: run-local-dry
+run-local-dry: venv/bin/python
+	venv/bin/python -m pytorch_auto_revert --dry-run
+
 .PHONY: run-local-workflows
 run-local-workflows: venv/bin/python
-	venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4320 --ignore-common-errors
+	venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4380 --ignore-common-errors
 
 deployment.zip:
 	mkdir -p deployment
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py
index 8ab2ca6954..43e6216172 100644
--- a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py
+++ b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py
@@ -62,6 +62,11 @@ def get_opts() -> argparse.Namespace:
         type=int,
         default=int(os.environ.get("GITHUB_INSTALLATION_ID", "0")),
     )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be restarted without actually doing it (use with --do-restart)",
+    )
 
     # no subcommand runs the lambda flow
     subparsers = parser.add_subparsers(dest="subcommand")
@@ -91,9 +96,9 @@ def get_opts() -> argparse.Namespace:
         help="Actually restart workflows for detected autorevert patterns",
     )
     workflow_parser.add_argument(
-        "--dry-run",
+        "--do-revert",
         action="store_true",
-        help="Show what would be restarted without actually doing it (use with --do-restart)",
+        help="When restarts complete and secondary pattern matches, log REVERT",
     )
     workflow_parser.add_argument(
         "--ignore-common-errors",
@@ -173,18 +178,20 @@ def main(*args, **kwargs) -> None:
                 "inductor",
                 "linux-binary-manywheel",
             ],
+            do_restart=True,
+            do_revert=False,
             hours=2,
             verbose=True,
-            do_restart=True,
-            dry_run=False,
+            dry_run=opts.dry_run,
             ignore_common_errors=True,
         )
     elif opts.subcommand == "autorevert-checker":
         autorevert_checker(
             opts.workflows,
+            do_restart=opts.do_restart,
+            do_revert=opts.do_revert,
             hours=opts.hours,
             verbose=opts.verbose,
-            do_restart=opts.do_restart,
             dry_run=opts.dry_run,
             ignore_common_errors=opts.ignore_common_errors,
         )
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py
index 6378de3df6..f44e19b392 100644
--- a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py
+++ b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py
@@ -9,6 +9,8 @@
 from datetime import datetime, timedelta
 from typing import Dict, Iterable, List, Optional, Set, Tuple
 
+from lazyproperty import lazyproperty
+
 from .clickhouse_client_helper import CHCliFactory
 
 
@@ -51,9 +53,19 @@ def job_base_names(self) -> Set[str]:
         return self._job_base_names
 
     def normalize_job_name(self, name: str) -> str:
-        """Strip shard suffix from job name for matching."""
+        """Normalize job name to a stable base for matching across commits.
+
+        - Drop any trailing parenthetical qualifiers (e.g., "(rocm)", shard notes)
+        - Strip common shard suffixes like ", 1, 1, " used in CI naming
+        - Collapse redundant whitespace
+        """
+        # Drop any trailing parenthetical qualifier
+        base = re.sub(r"\s*\(.*\)$", "", name)
         # Remove patterns like ", 1, 1, " or ", 2, 3, " from job names
-        return re.sub(r", \d+, \d+, ", ", ", name)
+        base = re.sub(r", \d+, \d+, ", ", ", base)
+        # Collapse multiple spaces
+        base = re.sub(r"\s+", " ", base).strip()
+        return base
 
     def get_job_base_names(self) -> Set[str]:
         """Get normalized job names (without shard info)."""
@@ -72,7 +84,6 @@ def __init__(
         self.workflow_names = workflow_names or []
         self.lookback_hours = lookback_hours
         self._workflow_commits_cache: Dict[str, List[CommitJobs]] = {}
-        self._commit_history = None
         self._ignore_classification_rules = ignore_classification_rules or set()
 
     def get_workflow_commits(self, workflow_name: str) -> List[CommitJobs]:
@@ -81,19 +92,25 @@ def get_workflow_commits(self, workflow_name: str) -> List[CommitJobs]:
             self._fetch_workflow_data()
         return self._workflow_commits_cache.get(workflow_name, [])
 
-    @property
+    @lazyproperty()
     def workflow_commits(self) -> List[CommitJobs]:
         """Get workflow commits for the first workflow (backward compatibility)."""
         if self.workflow_names:
             return self.get_workflow_commits(self.workflow_names[0])
         return []
 
-    @property
+    @lazyproperty()
     def commit_history(self) -> List[Dict]:
         """Get commit history, fetching if needed."""
-        if self._commit_history is None:
-            self._fetch_commit_history()
-        return self._commit_history or []
+        return self._fetch_commit_history()
+
+    @lazyproperty()
+    def commits_reverted(self) -> Set[str]:
+        return self._get_commits_reverted()
+
+    @lazyproperty()
+    def commits_reverted_with_info(self) -> Dict[str, Dict]:
+        return self._get_commits_reverted_with_info()
 
     def _fetch_workflow_data(self):
         """Fetch workflow job data from ClickHouse for all workflows in batch. From newer to older"""
@@ -106,7 +123,10 @@ def _fetch_workflow_data(self):
             f"Fetching workflow data for {len(self.workflow_names)} workflows since {lookback_time.isoformat()}..."
         )
 
-        query = """
+        # For pattern detection we consider non-restarted main branch jobs only
+        base_where = "workflow_event != 'workflow_dispatch' AND head_branch = 'main'"
+
+        query = f"""
         SELECT
             workflow_name,
             head_sha,
@@ -118,11 +138,10 @@ def _fetch_workflow_data(self):
         FROM workflow_job FINAL
         WHERE
-            workflow_name IN {workflow_names:Array(String)}
-            AND head_branch = 'main'
-            AND created_at >= {lookback_time:DateTime}
+            workflow_name IN {{workflow_names:Array(String)}}
+            AND {base_where}
+            AND created_at >= {{lookback_time:DateTime}}
             AND dynamoKey LIKE 'pytorch/pytorch/%'
-            AND workflow_event != 'workflow_dispatch' -- Exclude restart jobs
         ORDER BY workflow_name, workflow_created_at DESC, head_sha, name
         """
@@ -200,7 +219,7 @@ def _fetch_commit_history(self):
             query, parameters={"lookback_time": lookback_time}
         )
 
-        self._commit_history = [
+        return [
             {"sha": row[0], "message": row[1], "timestamp": row[2]}
             for row in result.result_rows
         ]
@@ -221,7 +240,7 @@ def _find_last_commit_with_job(
         job_results = []
         for commit in commits:
             for job in commit.jobs:
-                if job.name.split("(")[0] == job_name:  # Normalize job name
+                if commit.normalize_job_name(job.name) == job_name:
                     job_results.append(job)
             if job_results:
                 return (
@@ -245,27 +264,31 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
         Returns:
             List of all detected patterns
         """
+        # Commits are ordered newest -> older for this workflow
         commits = self.get_workflow_commits(workflow_name)
 
         if len(commits) < 3:
             return []
 
         patterns = []
 
+        # Slide a window centered at the suspected failing commit (i)
+        # We require: a newer commit with the same failure (i-1..0) and an older baseline (i+1..end)
         for i in range(1, len(commits) - 1):
-            suspected_commit1 = commits[i]  # The commit we want to check for failure
+            suspected_commit1 = commits[i]
 
             if suspected_commit1.has_pending_jobs:
                 continue
 
+            # Extract unique (classification_rule, normalized job) pairs for failing jobs on the suspected commit
             suspected_failures = {
                 (
                     j.classification_rule,
-                    j.name.split("(")[0],
+                    suspected_commit1.normalize_job_name(j.name),
                 )
                 for j in suspected_commit1.failed_jobs
             }
 
-            # Map to track newer commits for each failure
+            # Map failure -> the nearest newer commit where the same job failed with the same rule
             failure_to_newer_commit = {}
 
             for (
@@ -276,6 +299,7 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                     # Skip ignored classification rules
                     continue
 
+                # Find the closest newer commit that ran this exact normalized job name
                 newer_commit_same_job, newer_same_jobs = (
                     self._find_last_commit_with_job(
                         (commits[j] for j in range(i - 1, -1, -1)),
@@ -286,11 +310,16 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                     # No newer commit with the same job found
                     continue
 
-                if any(
-                    j.classification_rule == suspected_failure_class_rule
-                    for j in newer_same_jobs
+                if (
+                    newer_commit_same_job
+                    and newer_same_jobs
+                    and any(
+                        j.classification_rule == suspected_failure_class_rule
+                        and j.conclusion == "failure"
+                        for j in newer_same_jobs
+                    )
                 ):
-                    # The newer commit has the same job failing
+                    # The newer commit has the same failure on the same job
                     failure_key = (
                         suspected_failure_class_rule,
                         suspected_failure_job_name,
@@ -304,6 +333,7 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                 failure_rule,
                 job_name,
             ), newer_commit in failure_to_newer_commit.items():
+                # Find the first older commit that ran the same normalized job name
                 last_commit_with_same_job, last_same_jobs = (
                     self._find_last_commit_with_job(
                         (commits[j] for j in range(i + 1, len(commits))), job_name
@@ -311,29 +341,37 @@ def detect_autorevert_pattern_workflow(self, workflow_name: str) -> List[Dict]:
                 )
 
                 if not last_commit_with_same_job or not last_same_jobs:
-                    # No older commit with the same job found
+                    # No older commit with same normalized job name found
                    continue
 
                 # Ensure the oldest commit has stable signal (no running jobs)
                 if last_commit_with_same_job.has_pending_jobs:
                     continue
 
-                if any(j.classification_rule == failure_rule for j in last_same_jobs):
-                    # The older commit has the same job failing with same rule
+                if any(
+                    j.classification_rule == failure_rule and j.conclusion == "failure"
+                    for j in last_same_jobs
+                ):
+                    # Baseline already exhibits the same failure on this job -> not a commit-caused regression
                     continue
 
+                # Record the detected pattern: (newer_fail, suspected_fail) contrasted against a clean baseline
                 patterns.append(
                     {
                         "pattern_detected": True,
                         "workflow_name": workflow_name,
                         "failure_rule": failure_rule,
+                        "job_name_base": job_name,
                         "newer_commits": [
                             newer_commit.head_sha,
                             suspected_commit1.head_sha,
                         ],
                         "older_commit": last_commit_with_same_job.head_sha,
-                        "failed_job_names": [j.name for j in last_same_jobs],
-                        "older_job_coverage": [],
+                        "failed_job_names": [
+                            j.name
+                            for j in suspected_commit1.failed_jobs
+                            if j.classification_rule == failure_rule
+                        ][:10],
                     }
                 )
                 break
@@ -382,7 +420,120 @@ def detect_autorevert_pattern(self) -> List[Dict]:
 
         return all_patterns
 
-    def get_commits_reverted(self) -> Set[str]:
+    def _fetch_single_commit_jobs(
+        self,
+        workflow_name: str,
+        head_sha: str,
+        restarted_only: bool = False,
+    ) -> Optional[CommitJobs]:
+        """Fetch jobs for a single workflow+commit, optionally only restarted runs.
+
+        Groups all jobs by head_sha (assumes at most one restart dispatch of interest).
+        Returns CommitJobs or None if no jobs found in lookback window.
+        """
+        lookback_time = datetime.now() - timedelta(hours=self.lookback_hours)
+
+        where_event = (
+            "workflow_event = {we:String} AND head_branch LIKE {hb:String}"
+            if restarted_only
+            else "workflow_event != {we:String} AND head_branch = {hb:String}"
+        )
+
+        query = f"""
+        SELECT
+            head_sha,
+            name,
+            conclusion,
+            status,
+            torchci_classification.rule AS classification_rule,
+            created_at AS workflow_created_at
+        FROM workflow_job FINAL
+        WHERE workflow_name = {{workflow_name:String}}
+            AND head_sha = {{head_sha:String}}
+            AND {where_event}
+            AND created_at >= {{lookback_time:DateTime}}
+            AND dynamoKey LIKE 'pytorch/pytorch/%'
+        ORDER BY workflow_created_at DESC, name
+        """
+
+        hb = "trunk/%" if restarted_only else "main"
+        we = "workflow_dispatch" if restarted_only else "workflow_dispatch"
+        # Note: for non-restarted we exclude workflow_dispatch via != in WHERE above
+
+        result = CHCliFactory().client.query(
+            query,
+            parameters={
+                "workflow_name": workflow_name,
+                "head_sha": head_sha,
+                "we": we,
+                "hb": hb,
+                "lookback_time": lookback_time,
+            },
+        )
+
+        rows = list(result.result_rows)
+        if not rows:
+            return None
+
+        # Use the newest created_at among returned rows as the commit's created_at marker
+        latest_created = max(r[5] for r in rows)
+        cj = CommitJobs(head_sha=head_sha, created_at=latest_created, jobs=[])
+        for row in rows:
+            _, name, conclusion, status, classification_rule, created_at = row
+            cj.jobs.append(
+                JobResult(
+                    head_sha=head_sha,
+                    name=name,
+                    conclusion=conclusion,
+                    status=status,
+                    classification_rule=classification_rule or "",
+                    workflow_created_at=created_at,
+                )
+            )
+        return cj
+
+    def confirm_commit_caused_failure_on_restarted(self, pattern: Dict) -> bool:
+        """Confirm commit-caused failure using restarted runs.
+
+        Requires that:
+        - first failing commit's restarted run has the same failure classification for the job
+        - previous commit's restarted run does NOT have that failure classification for the job
+        - both restarted runs have no pending jobs
+        """
+        workflow_name = pattern["workflow_name"]
+        job_base = pattern.get("job_name_base")
+        failure_rule = pattern["failure_rule"]
+        first_failing = pattern["newer_commits"][1]
+        previous_commit = pattern["older_commit"]
+
+        # Fetch restarted jobs for first failing and previous commits
+        failing_jobs = self._fetch_single_commit_jobs(
+            workflow_name, first_failing, restarted_only=True
+        )
+        prev_jobs = self._fetch_single_commit_jobs(
+            workflow_name, previous_commit, restarted_only=True
+        )
+        if not failing_jobs or not prev_jobs:
+            return False
+
+        # Pending check
+        if failing_jobs.has_pending_jobs or prev_jobs.has_pending_jobs:
+            return False
+
+        def has_rule(cj: CommitJobs, rule: str) -> bool:
+            return any(
+                cj.normalize_job_name(j.name) == job_base
+                and j.classification_rule == rule
+                and j.conclusion == "failure"
+                for j in cj.jobs
+            )
+
+        # Commit-caused if failing commit reproduces, previous does not
+        return has_rule(failing_jobs, failure_rule) and not has_rule(
+            prev_jobs, failure_rule
+        )
+
+    def _get_commits_reverted(self) -> Set[str]:
         """
         Get all commits that were reverted within the lookback window.
@@ -514,7 +665,7 @@ def extract_revert_categories_batch(self, messages: List[str]) -> Dict[str, str]
 
         return result
 
-    def get_commits_reverted_with_info(self) -> Dict[str, Dict]:
+    def _get_commits_reverted_with_info(self) -> Dict[str, Dict]:
         """
         Get all commits that were reverted with detailed information including categories.
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py
index 87a0a90d24..21b239b9ae 100644
--- a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py
+++ b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py
@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 
 from ..autorevert_checker import AutorevertPatternChecker
@@ -6,9 +7,10 @@
 def autorevert_checker(
     workflow_names: list[str],
+    do_restart: bool,
+    do_revert: bool,
     hours: int = 48,
     verbose: bool = False,
-    do_restart: bool = False,
     dry_run: bool = False,
     ignore_common_errors=True,
 ):
@@ -61,8 +63,8 @@ def autorevert_checker(
 
     # Detect patterns
     patterns = checker.detect_autorevert_pattern()
-    reverts = checker.get_commits_reverted()
-    reverts_with_info = checker.get_commits_reverted_with_info()
+    reverts = checker.commits_reverted
+    reverts_with_info = checker.commits_reverted_with_info
 
     # Categorize reverts
     reverts_by_category = defaultdict(set)
@@ -90,6 +92,8 @@ def autorevert_checker(
     # Track reverts
     reverted_patterns = []
 
+    false_positive = 0
+
     for i, pattern in enumerate(patterns, 1):
         print(f"\nPattern #{i}:")
 
@@ -109,53 +113,76 @@ def autorevert_checker(
                     f" - {additional['workflow_name']}: {additional['failure_rule']}"
                 )
 
-        # Check if the second commit (older of the two failures) was reverted
-        second_commit = pattern["newer_commits"][1]
-        revert_result = revert_checker.is_commit_reverted(second_commit)
+        # For clarity in naming
+        workflow_name = pattern["workflow_name"]
+        first_failing = pattern["newer_commits"][
+            1
+        ]  # the older of the two failing commits
+        previous_commit = pattern[
+            "older_commit"
+        ]  # previously successful commit for the matched job
+        revert_result = revert_checker.is_commit_reverted(first_failing)
 
         if revert_result:
-            not_found_reverts.discard(second_commit)
-            category = reverts_with_info.get(second_commit, {}).get(
+            not_found_reverts.discard(first_failing)
+            category = reverts_with_info.get(first_failing, {}).get(
                 "category", "uncategorized"
             )
             print(
-                f"✓ REVERTED ({category}): {second_commit} was reverted by {revert_result['revert_sha'][:8]} "
+                f"✓ REVERTED ({category}): {first_failing} was reverted by {revert_result['revert_sha'][:8]} "
                 f"after {revert_result['hours_after_target']:.1f} hours"
             )
             reverted_patterns.append(pattern)
         else:
-            print(f"✗ NOT REVERTED: {second_commit} was not reverted")
+            false_positive += 1
+            print(f"✗ NOT REVERTED: {first_failing} was not reverted")
 
             # Try to restart workflow if --do-restart flag is set and not already reverted
             if do_restart and restart_checker:
-                # Restart for the second commit (older of the two failures)
-                workflow_name = pattern["workflow_name"]
-
-                # Check if already restarted
-                if restart_checker.has_restarted_workflow(
-                    workflow_name, second_commit
-                ):
-                    print(
-                        f" ⟳ ALREADY RESTARTED: {workflow_name} for {second_commit[:8]}"
-                    )
-                elif dry_run:
-                    print(
-                        f" ⟳ DRY RUN: Would restart {workflow_name} for {second_commit[:8]}"
-                    )
-                    restarted_commits.append((workflow_name, second_commit))
-                else:
-                    success = restart_checker.restart_workflow(
-                        workflow_name, second_commit
-                    )
-                    if success:
+                # Restart the first failing (older failing) and the previous (successful) commit
+                for target_commit in (first_failing, previous_commit):
+                    if restart_checker.has_restarted_workflow(
+                        workflow_name, target_commit
+                    ):
                         print(
-                            f" ✓ RESTARTED: {workflow_name} for {second_commit[:8]}"
+                            f" ⟳ ALREADY RESTARTED: {workflow_name} for {target_commit[:8]}"
                         )
-                        restarted_commits.append((workflow_name, second_commit))
-                    else:
+                        continue
+                    if dry_run:
                         print(
-                            f" ✗ FAILED TO RESTART: {workflow_name} for {second_commit[:8]}"
+                            f" ⟳ DRY RUN: Would restart {workflow_name} for {target_commit[:8]}"
+                        )
+                        restarted_commits.append((workflow_name, target_commit))
+                    else:
+                        success = restart_checker.restart_workflow(
+                            workflow_name, target_commit
                         )
+                        if success:
+                            print(
+                                f" ✓ RESTARTED: {workflow_name} for {target_commit[:8]}"
+                            )
+                            restarted_commits.append((workflow_name, target_commit))
+                        else:
+                            print(
+                                f" ✗ FAILED TO RESTART: {workflow_name} for {target_commit[:8]}"
+                            )
+
+                # Secondary verification: compare first failing vs previous on restarted runs.
+                if do_revert:
+                    try:
+                        if checker.confirm_commit_caused_failure_on_restarted(pattern):
+                            if dry_run:
+                                print(
+                                    f" ⚠ DRY RUN: Would record REVERT for {first_failing[:8]} ({workflow_name})"
+                                )
+                            else:
+                                print(
+                                    f" ⚠ REVERT recorded for {first_failing[:8]} ({workflow_name})"
+                                )
+                    except Exception as e:
+                        logging.warning(
+                            f"Secondary verification failed for {first_failing[:8]} ({workflow_name}): {e}"
+                        )
 
         if verbose:
             print(f"Failed jobs ({len(pattern['failed_job_names'])}):")
@@ -164,11 +191,7 @@ def autorevert_checker(
             if len(pattern["failed_job_names"]) > 5:
                 print(f" ... and {len(pattern['failed_job_names']) - 5} more")
 
-            print(f"Job coverage overlap ({len(pattern['older_job_coverage'])}):")
-            for job in pattern["older_job_coverage"][:3]:
-                print(f" - {job}")
-            if len(pattern["older_job_coverage"]) > 3:
-                print(f" ... and {len(pattern['older_job_coverage']) - 3} more")
+            # Job coverage overlap logging removed (older_job_coverage dropped from pattern)
 
         if revert_result and verbose:
             print(f"Revert message: {revert_result['revert_message'][:100]}...")
@@ -187,10 +210,15 @@ def autorevert_checker(
     )
     print(f"Commits checked: {total_commits}")
+    len_patterns = len(patterns)
+    len_reverted_patterns = len(reverted_patterns)
+    ratio_revert_patterns = (
+        len_reverted_patterns / len_patterns if len_patterns > 0 else 0
+    )
     print(f"Auto revert patterns detected: {len(patterns)}")
     print(
-        "Actual reverts inside auto revert patterns detected (precision): "
-        + f"{len(reverted_patterns)} ({len(reverted_patterns) / len(patterns) * 100:.1f}%)"
+        "Actual reverts inside auto revert patterns detected (%): "
+        + f"{len_reverted_patterns} ({ratio_revert_patterns * 100:.1f}%)"
     )
     print(f"Total revert commits in period: {len(reverts)}")
@@ -217,13 +245,42 @@ def autorevert_checker(
     print(f"\nTotal reverts excluding ghfirst: {len(non_ghfirst_reverts)}")
 
     # Calculate recall based on non-ghfirst reverts only
-    if non_ghfirst_reverts:
-        print(
-            "Reverts (excluding ghfirst) that dont match any auto revert pattern detected (recall): "
-            + f"{len(not_found_non_ghfirst)} ({len(not_found_non_ghfirst) / len(non_ghfirst_reverts) * 100:.1f}%)"
-        )
-    else:
-        print("No non-ghfirst reverts found in the period")
+    len_non_ghfirst_reverts = len(non_ghfirst_reverts)
+    len_not_found_non_ghfirst = len(not_found_non_ghfirst)
+    ratio_non_ghfirst_reverts = (
+        len_not_found_non_ghfirst / len_non_ghfirst_reverts
+        if len_non_ghfirst_reverts > 0
+        else 0
+    )
+    # recall_non_ghfirst = 1 - ratio_non_ghfirst_reverts
+    print(
+        "Reverts (excluding ghfirst) that don't match any auto revert pattern detected (%): "
+        + f"({len_not_found_non_ghfirst}) ({ratio_non_ghfirst_reverts * 100:.1f}%)"
+    )
+
+    len_reverts_with_info = len(reverts_with_info)
+    stats_precision = (
+        len_reverted_patterns / len_patterns if len_patterns > 0 else 0.0
+    )
+    stats_recall = (
+        len_reverted_patterns / len_reverts_with_info
+        if len_reverts_with_info > 0
+        else 0.0
+    )
+    stats_f1 = (
+        2 * stats_precision * stats_recall / (stats_precision + stats_recall)
+        if (stats_precision + stats_recall) > 0
+        else 0.0
+    )
+
+    print()
+    print("*********************************************************************")
+    print("STATS SUMMARY:")
+    print(f" PRECISION: {stats_precision * 100:.1f}%")
+    print(f" RECALL: {stats_recall * 100:.1f}%")
+    print(f" F1: {stats_f1 * 100:.1f}%")
+    print("*********************************************************************")
+    print()
 
     workflow_statistics = defaultdict(
         lambda: {"match_pattern": 0, "reverts": 0, "reverts_non_ghfirst": 0}
     )
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_autorevert_detector.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_autorevert_detector.py
new file mode 100644
index 0000000000..09b891f0d1
--- /dev/null
+++ b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_autorevert_detector.py
@@ -0,0 +1,98 @@
+import unittest
+from datetime import datetime, timedelta
+
+from pytorch_auto_revert.autorevert_checker import (
+    AutorevertPatternChecker,
+    CommitJobs,
+    JobResult,
+)
+
+
+class TestAutorevertDetectorROCM(unittest.TestCase):
+    def _make_job(self, sha, name, conclusion, rule="", status="completed", t=None):
+        return JobResult(
+            head_sha=sha,
+            name=name,
+            conclusion=conclusion,
+            status=status,
+            classification_rule=rule,
+            workflow_created_at=t or datetime.now(),
+        )
+
+    def test_rocm_realnames_success_rule_noise(self):
+        # Simulate the real ROCm case from ClickHouse where the baseline commit
+        # has success jobs but with rule='pytest failure', and two newer commits
+        # have single-shard failures with rule='pytest failure'.
+
+        now = datetime.now()
+        sha_old = "33ec6e3e9aa2b93f7d907236aa10ba2b36355018"
+        sha_mid = "bbc0df1094b5a4dcd2cce83f8402127b07913231"
+        sha_new = "4fd5fabee9b2641440a413adf54f728fe2816375"
+
+        # Common job names from the data
+        build = "linux-jammy-rocm-py3.10 / build"
+        shard1 = "linux-jammy-rocm-py3.10 / test (default, 1, 6, linux.rocm.gpu.2)"
+        shard2 = "linux-jammy-rocm-py3.10 / test (default, 2, 6, linux.rocm.gpu.2)"
+        shard3 = "linux-jammy-rocm-py3.10 / test (default, 3, 6, linux.rocm.gpu.2)"
+        shard4 = "linux-jammy-rocm-py3.10 / test (default, 4, 6, linux.rocm.gpu.2)"
+        shard5 = "linux-jammy-rocm-py3.10 / test (default, 5, 6, linux.rocm.gpu.2)"
+        shard6 = "linux-jammy-rocm-py3.10 / test (default, 6, 6, linux.rocm.gpu.2)"
+
+        # Old (baseline) commit: all success, but some shards show a rule label
+        t_old = now - timedelta(hours=2)
+        old_jobs = [
+            self._make_job(sha_old, build, "success", t=t_old),
+            self._make_job(sha_old, shard1, "success", rule="GHA error", t=t_old),
+            self._make_job(sha_old, shard2, "success", rule="pytest failure", t=t_old),
+            self._make_job(sha_old, shard3, "success", rule="GHA error", t=t_old),
+            self._make_job(sha_old, shard4, "success", rule="pytest failure", t=t_old),
+            self._make_job(sha_old, shard5, "success", rule="GHA error", t=t_old),
+            self._make_job(sha_old, shard6, "success", rule="pytest failure", t=t_old),
+        ]
+
+        # Middle (first failing) commit: shard2 fails with pytest failure
+        t_mid = now - timedelta(hours=1)
+        mid_jobs = [
+            self._make_job(sha_mid, build, "success", t=t_mid),
+            self._make_job(sha_mid, shard1, "success", rule="GHA error", t=t_mid),
+            self._make_job(sha_mid, shard2, "failure", rule="pytest failure", t=t_mid),
+            self._make_job(sha_mid, shard3, "success", rule="pytest failure", t=t_mid),
+            self._make_job(sha_mid, shard4, "success", rule="GHA error", t=t_mid),
+            self._make_job(sha_mid, shard5, "success", rule="pytest failure", t=t_mid),
+            self._make_job(sha_mid, shard6, "success", rule="pytest failure", t=t_mid),
+        ]
+
+        # Newest (second failing) commit: shard5 fails with pytest failure
+        t_new = now
+        new_jobs = [
+            self._make_job(sha_new, build, "success", t=t_new),
+            self._make_job(sha_new, shard1, "success", rule="GHA error", t=t_new),
+            self._make_job(sha_new, shard2, "success", rule="pytest failure", t=t_new),
+            self._make_job(sha_new, shard3, "success", rule="GHA error", t=t_new),
+            self._make_job(sha_new, shard4, "success", rule="pytest failure", t=t_new),
+            self._make_job(sha_new, shard5, "failure", rule="pytest failure", t=t_new),
+            self._make_job(sha_new, shard6, "success", rule="GHA error", t=t_new),
+        ]
+
+        cj_old = CommitJobs(head_sha=sha_old, created_at=t_old, jobs=old_jobs)
+        cj_mid = CommitJobs(head_sha=sha_mid, created_at=t_mid, jobs=mid_jobs)
+        cj_new = CommitJobs(head_sha=sha_new, created_at=t_new, jobs=new_jobs)
+
+        # Checker expects commits sorted newest->older; provide in that order
+        checker = AutorevertPatternChecker(["rocm"], lookback_hours=48)
+        checker._workflow_commits_cache = {"rocm": [cj_new, cj_mid, cj_old]}
+
+        patterns = checker.detect_autorevert_pattern_workflow("rocm")
+
+        self.assertGreaterEqual(len(patterns), 1, "Expected at least one pattern")
+        p = patterns[0]
+        self.assertEqual(p["workflow_name"], "rocm")
+        self.assertEqual(p["failure_rule"], "pytest failure")
+        # Order: [newer_commit, suspected_commit]
+        self.assertEqual(p["newer_commits"][0], sha_new)
+        self.assertEqual(p["newer_commits"][1], sha_mid)
+        self.assertEqual(p["older_commit"], sha_old)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/aws/lambda/pytorch-auto-revert/requirements.txt b/aws/lambda/pytorch-auto-revert/requirements.txt
index a2a8167e91..11fefa81b8 100644
--- a/aws/lambda/pytorch-auto-revert/requirements.txt
+++ b/aws/lambda/pytorch-auto-revert/requirements.txt
@@ -1,4 +1,5 @@
 clickhouse-connect==0.8.14
+lazyproperty @ git+https://github.com/jeanschmidt/python_propertyutils@6a9083af8582e85c2bb30be85d22d8e0a88208c0
 PyGithub==2.6.1
 python-dotenv>=1.0.0
 requests>=2.31.0