Skip to content

Commit bc704b6

Browse files
authored
[autorevert] Add workflow restarts (#6962)
### Summary Adds workflow restart capability to the PyTorch auto-revert tool, enabling automatic re-running of workflows for commits that match autorevert patterns but haven't been reverted yet. ### Changes - Added restart methods to WorkflowRestartChecker: - restart_workflow(): Restarts a workflow for a specific commit with duplicate prevention - Checks ClickHouse for existing restarts before attempting - Enhanced autorevert-checker command: - Added --do-restart flag to enable automatic workflow restarts - Added --dry-run flag to preview restart actions without execution - Restarts workflows only for non-reverted commits matching autorevert patterns - Fixed workflow naming consistency: - Normalized workflow names by removing .yml extension for ClickHouse queries - Added .yml extension only for GitHub API calls - Updated do-restart command: - Now requires commit SHA (removed unused restart_latest_workflow) - Leverages same restart logic with duplicate prevention ### Usage ``` # Check patterns and restart workflows python -m pytorch_auto_revert autorevert-checker pull trunk --do-restart # Dry run to preview restarts python -m pytorch_auto_revert autorevert-checker pull trunk --do-restart --dry-run # Manual restart python -m pytorch_auto_revert do-restart trunk abc123def ``` ### Testing ``` python -m pytorch_auto_revert autorevert-checker inductor --hours 12 --do-restart --dry-run Fetching workflow data for 1 workflows since 2025-07-30T22:47:17.357776... 
Found 19 commits with job data for workflow 'inductor' ✓ 1 AUTOREVERT PATTERN DETECTED Pattern #1: Failure rule: 'GHA error' Recent commits with failure: f89c28cc 5b2ad927 Older commit without failure: 7a4167a1 ✗ NOT REVERTED: 5b2ad9279cb2e440d45253d28f2101a75fd42344 was not reverted ⟳ ALREADY RESTARTED: inductor for 5b2ad927 ================================================== SUMMARY STATISTICS ================================================== Workflow(s): inductor Timeframe: 12 hours Commits checked: 19 Auto revert patterns detected: 1 Actual reverts inside auto revert patterns detected (precision): 0 (0.0%) Total revert commits in period: 0 Total reverts excluding ghfirst: 0 No non-ghfirst reverts found in the period Per workflow precision: inductor: 0 reverts out of 1 patterns (0.0%) [excluding ghfirst: 0 (0.0%)] ```
1 parent 9492bbd commit bc704b6

File tree

5 files changed

+155
-18
lines changed

5 files changed

+155
-18
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@ def get_opts() -> argparse.Namespace:
8484
action="store_true",
8585
help="Show detailed output including commit summaries",
8686
)
87+
workflow_parser.add_argument(
88+
"--do-restart",
89+
action="store_true",
90+
help="Actually restart workflows for detected autorevert patterns",
91+
)
92+
workflow_parser.add_argument(
93+
"--dry-run",
94+
action="store_true",
95+
help="Show what would be restarted without actually doing it (use with --do-restart)",
96+
)
8797

8898
# workflow-restart-checker subcommand
8999
workflow_restart_parser = subparsers.add_parser(
@@ -146,7 +156,13 @@ def main(*args, **kwargs) -> None:
146156
if opts.subcommand == "lambda":
147157
print("TODO: run lambda flow")
148158
elif opts.subcommand == "autorevert-checker":
149-
autorevert_checker(opts.workflows, hours=opts.hours, verbose=opts.verbose)
159+
autorevert_checker(
160+
opts.workflows,
161+
hours=opts.hours,
162+
verbose=opts.verbose,
163+
do_restart=opts.do_restart,
164+
dry_run=opts.dry_run,
165+
)
150166
elif opts.subcommand == "workflow-restart-checker":
151167
workflow_restart_checker(opts.workflow, commit=opts.commit, days=opts.days)
152168
elif opts.subcommand == "do-restart":

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
from collections import defaultdict
22

33
from ..autorevert_checker import AutorevertPatternChecker
4+
from ..workflow_checker import WorkflowRestartChecker
45

56

67
def autorevert_checker(
7-
workflow_names: list[str], hours: int = 48, verbose: bool = False
8+
workflow_names: list[str],
9+
hours: int = 48,
10+
verbose: bool = False,
11+
do_restart: bool = False,
12+
dry_run: bool = False,
813
):
914
# Initialize checker
1015
checker = AutorevertPatternChecker(workflow_names, hours)
@@ -67,6 +72,10 @@ def autorevert_checker(
6772
workflow_names=[], lookback_hours=hours * 2
6873
)
6974

75+
# Initialize workflow restart checker if needed
76+
restart_checker = WorkflowRestartChecker() if do_restart else None
77+
restarted_commits = []
78+
7079
# Track reverts
7180
reverted_patterns = []
7281

@@ -106,6 +115,37 @@ def autorevert_checker(
106115
else:
107116
print(f"✗ NOT REVERTED: {second_commit} was not reverted")
108117

118+
# Try to restart workflow if --do-restart flag is set and not already reverted
119+
if do_restart and restart_checker:
120+
# Restart for the second commit (older of the two failures)
121+
workflow_name = pattern["workflow_name"]
122+
123+
# Check if already restarted
124+
if restart_checker.has_restarted_workflow(
125+
workflow_name, second_commit
126+
):
127+
print(
128+
f" ⟳ ALREADY RESTARTED: {workflow_name} for {second_commit[:8]}"
129+
)
130+
elif dry_run:
131+
print(
132+
f" ⟳ DRY RUN: Would restart {workflow_name} for {second_commit[:8]}"
133+
)
134+
restarted_commits.append((workflow_name, second_commit))
135+
else:
136+
success = restart_checker.restart_workflow(
137+
workflow_name, second_commit
138+
)
139+
if success:
140+
print(
141+
f" ✓ RESTARTED: {workflow_name} for {second_commit[:8]}"
142+
)
143+
restarted_commits.append((workflow_name, second_commit))
144+
else:
145+
print(
146+
f" ✗ FAILED TO RESTART: {workflow_name} for {second_commit[:8]}"
147+
)
148+
109149
if verbose:
110150
print(f"Failed jobs ({len(pattern['failed_job_names'])}):")
111151
for job in pattern["failed_job_names"][:5]:
@@ -216,6 +256,12 @@ def autorevert_checker(
216256
f" - {pattern['failure_rule']}: {second_commit[:8]} ({category})"
217257
)
218258

259+
# Show restart summary if applicable
260+
if do_restart and restarted_commits:
261+
print(f"\nRestarted workflows: {len(restarted_commits)}")
262+
for workflow, commit in restarted_commits:
263+
print(f" - {workflow} for {commit[:8]}")
264+
219265
else:
220266
print("✗ No autorevert patterns detected")
221267

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
from ..workflow_checker import WorkflowRestartChecker
22

33

4-
def do_restart_workflow(workflow: str, commit: str = None) -> None:
4+
def do_restart_workflow(workflow: str, commit: str) -> None:
55
checker = WorkflowRestartChecker()
66

7-
if commit:
8-
# Restart specific commit
9-
success = checker.restart_workflow(workflow, commit)
10-
print(f"Commit {commit}: {'✓ RESTARTED' if success else '✗ Not restarted'}")
11-
else:
12-
# Restart latest commit
13-
success = checker.restart_latest_workflow(workflow)
14-
print(f"Latest commit: {'✓ RESTARTED' if success else '✗ Not restarted'}")
7+
# Restart specific commit
8+
success = checker.restart_workflow(workflow, commit)
9+
print(f"Commit {commit}: {'✓ RESTARTED' if success else '✗ Not restarted'}")

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/workflow_checker.py

Lines changed: 86 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
WorkflowRestartChecker for querying restarted workflows via ClickHouse.
33
"""
44

5+
import logging
56
from datetime import datetime, timedelta
67
from typing import Dict, Set
78

@@ -11,21 +12,25 @@
1112
class WorkflowRestartChecker:
1213
"""Check if workflows have been restarted using ClickHouse."""
1314

14-
def __init__(self):
15+
def __init__(self, repo_owner: str = "pytorch", repo_name: str = "pytorch"):
1516
self._cache: Dict[str, bool] = {}
17+
self.repo_owner = repo_owner
18+
self.repo_name = repo_name
1619

1720
def has_restarted_workflow(self, workflow_name: str, commit_sha: str) -> bool:
1821
"""
1922
Check if a workflow has been restarted for given commit.
2023
2124
Args:
22-
workflow_name: Name of workflow (e.g., "trunk")
25+
workflow_name: Name of workflow (e.g., "trunk" or "trunk.yml")
2326
commit_sha: Commit SHA to check
2427
2528
Returns:
2629
bool: True if workflow was restarted (workflow_dispatch with trunk/* branch)
2730
"""
28-
cache_key = f"{workflow_name}:{commit_sha}"
31+
# Normalize workflow name - remove .yml extension for consistency
32+
normalized_workflow_name = workflow_name.replace(".yml", "")
33+
cache_key = f"{normalized_workflow_name}:{commit_sha}"
2934
if cache_key in self._cache:
3035
return self._cache[cache_key]
3136

@@ -49,7 +54,7 @@ def has_restarted_workflow(self, workflow_name: str, commit_sha: str) -> bool:
4954
"commit_sha": commit_sha,
5055
"workflow_event": "workflow_dispatch",
5156
"head_branch": f"trunk/{commit_sha}",
52-
"workflow_name": workflow_name,
57+
"workflow_name": normalized_workflow_name,
5358
},
5459
)
5560

@@ -62,12 +67,14 @@ def get_restarted_commits(self, workflow_name: str, days_back: int = 7) -> Set[s
6267
Get all commits with restarted workflows in date range.
6368
6469
Args:
65-
workflow_name: Name of workflow
70+
workflow_name: Name of workflow (e.g., "trunk" or "trunk.yml")
6671
days_back: Number of days to look back
6772
6873
Returns:
6974
Set of commit SHAs that have restarted workflows
7075
"""
76+
# Normalize workflow name - remove .yml extension for consistency
77+
normalized_workflow_name = workflow_name.replace(".yml", "")
7178
since_date = datetime.now() - timedelta(days=days_back)
7279

7380
query = """
@@ -80,18 +87,90 @@ def get_restarted_commits(self, workflow_name: str, days_back: int = 7) -> Set[s
8087
"""
8188

8289
result = CHCliFactory().client.query(
83-
query, {"workflow_name": workflow_name, "since_date": since_date}
90+
query, {"workflow_name": normalized_workflow_name, "since_date": since_date}
8491
)
8592

8693
commits = {row[0] for row in result.result_rows}
8794

8895
# Update cache
8996
for commit_sha in commits:
90-
cache_key = f"{workflow_name}:{commit_sha}"
97+
cache_key = f"{normalized_workflow_name}:{commit_sha}"
9198
self._cache[cache_key] = True
9299

93100
return commits
94101

95102
def clear_cache(self):
96103
"""Clear the results cache."""
97104
self._cache.clear()
105+
106+
def restart_workflow(self, workflow_name: str, commit_sha: str) -> bool:
    """
    Restart a workflow for a specific commit SHA.

    The workflow is dispatched through the GitHub client on the
    ``trunk/{commit_sha}`` ref, unless has_restarted_workflow() already
    reports a restart for this workflow/commit pair.

    Args:
        workflow_name: Name of the workflow (e.g., "trunk" or "trunk.yml")
        commit_sha: The commit SHA to restart workflow for

    Returns:
        bool: True if workflow was successfully dispatched, False otherwise
            (already restarted, GitHub authentication not configured, or
            any error while obtaining the client or dispatching)
    """
    # Normalize workflow name: strip only a trailing ".yml". A plain
    # str.replace would also remove ".yml" from the middle of a name and
    # make us dispatch a different workflow file.
    extension = ".yml"
    normalized_workflow_name = workflow_name
    if normalized_workflow_name.endswith(extension):
        normalized_workflow_name = normalized_workflow_name[: -len(extension)]

    # Check if already restarted (duplicate prevention)
    if self.has_restarted_workflow(normalized_workflow_name, commit_sha):
        logging.warning(
            "Workflow %s already restarted for commit %s",
            normalized_workflow_name,
            commit_sha,
        )
        return False

    # Get GitHub client
    try:
        # Imported lazily so the GitHub helper is only needed when a
        # restart is actually attempted.
        from .github_client_helper import GHClientFactory

        factory = GHClientFactory()
        if not (factory.token_auth_provided or factory.key_auth_provided):
            logging.error("GitHub authentication not configured")
            return False

        client = factory.client
    except Exception as e:
        logging.error("Failed to get GitHub client: %s", e)
        return False

    try:
        # Use trunk/{sha} tag format
        tag_ref = f"trunk/{commit_sha}"

        # The GitHub API addresses the workflow by file name, so the
        # extension is added back here only.
        workflow_file_name = f"{normalized_workflow_name}{extension}"

        # Get repo and workflow objects
        repo = client.get_repo(f"{self.repo_owner}/{self.repo_name}")
        workflow = repo.get_workflow(workflow_file_name)

        # Dispatch the workflow
        workflow.create_dispatch(ref=tag_ref, inputs={})

        # Construct the workflow runs URL
        workflow_url = (
            f"https://github.com/{self.repo_owner}/{self.repo_name}"
            f"/actions/workflows/{workflow_file_name}"
            f"?query=branch%3Atrunk%2F{commit_sha}"
        )
        logging.info(
            "Successfully dispatched workflow %s for commit %s\n  View at: %s",
            normalized_workflow_name,
            commit_sha,
            workflow_url,
        )

        # Invalidate cache for this workflow/commit: the duplicate check
        # above may have cached a now-stale result.
        self._cache.pop(f"{normalized_workflow_name}:{commit_sha}", None)
        return True

    except Exception as e:
        logging.error(
            "Error dispatching workflow %s: %s", normalized_workflow_name, e
        )
        return False
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
clickhouse-connect==0.8.14
22
PyGithub==2.6.1
33
python-dotenv>=1.0.0
4+
requests>=2.31.0

0 commit comments

Comments
 (0)