Skip to content

Commit c60987b

Browse files
authored
[autorevert] fix query sorting (#7043)
The current sorting uses the workflow dispatch time, which does not match the order of the commit sequence. The correct approach is to sort by merge timestamp for all workflows. This was causing errors in the detection logic, as it was mixing up the order of jobs for commit evaluation and detecting rules where it should not.

```
==================================================
SUMMARY STATISTICS
==================================================
Workflow(s): Lint, trunk, pull, inductor, linux-binary-manywheel
Timeframe: 4380 hours
Commits checked: 33873
Auto revert patterns detected: 560
Actual reverts inside auto revert patterns detected (%): 204 (36.4%)
Total revert commits in period: 601
Revert categories:
nosignal: 215 (35.8%)
ghfirst: 151 (25.1%)
uncategorized: 105 (17.5%)
ignoredsignal: 70 (11.6%)
weird: 46 (7.7%)
landrace: 14 (2.3%)
Total reverts excluding ghfirst: 450
Reverts (excluding ghfirst) that dont match any auto revert pattern detected (%): (268) (59.6%)
*********************************************************************
STATS SUMMARY:
PRECISION: 36.4%
RECALL: 33.9%
F1: 35.1%
*********************************************************************
Per workflow precision:
Lint: 50 reverts out of 60 patterns (83.3%) [excluding ghfirst: 46 (76.7%)]
trunk: 40 reverts out of 74 patterns (54.1%) [excluding ghfirst: 37 (50.0%)]
pull: 79 reverts out of 276 patterns (28.6%) [excluding ghfirst: 74 (26.8%)]
inductor: 34 reverts out of 144 patterns (23.6%) [excluding ghfirst: 31 (21.5%)]
linux-binary-manywheel: 1 reverts out of 6 patterns (16.7%) [excluding ghfirst: 0 (0.0%)]
```
1 parent d3977e8 commit c60987b

File tree

1 file changed

+41
-29
lines changed

1 file changed

+41
-29
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -125,27 +125,38 @@ def _fetch_workflow_data(self):
125125
f"Fetching workflow data for {len(self.workflow_names)} workflows since {lookback_time.isoformat()}..."
126126
)
127127

128-
# For pattern detection we consider non-restarted main branch jobs only
129-
base_where = "workflow_event != 'workflow_dispatch' AND head_branch = 'main'"
130-
131-
query = f"""
132-
SELECT
133-
workflow_name,
134-
head_sha,
135-
name,
136-
conclusion,
137-
status,
138-
torchci_classification.rule AS classification_rule,
139-
created_at AS workflow_created_at
140-
FROM
141-
workflow_job FINAL
142-
WHERE
143-
workflow_name IN {{workflow_names:Array(String)}}
144-
AND {base_where}
145-
AND created_at >= {{lookback_time:DateTime}}
146-
AND dynamoKey LIKE 'pytorch/pytorch/%'
147-
ORDER BY
148-
workflow_name, workflow_created_at DESC, head_sha, name
128+
query = """
129+
SELECT
130+
wf.workflow_name,
131+
wf.head_sha,
132+
wf.name,
133+
wf.conclusion,
134+
wf.status,
135+
wf.torchci_classification.rule AS classification_rule,
136+
wf.created_at AS workflow_created_at
137+
FROM workflow_job AS wf FINAL
138+
INNER JOIN (
139+
-- Deduplicate pushes by head_sha using group+max,
140+
-- keeping the most recent timestamp
141+
-- this is faster than using distinct
142+
SELECT
143+
head_commit.id as sha,
144+
max(head_commit.timestamp) as timestamp
145+
FROM default.push
146+
WHERE head_commit.timestamp >= {lookback_time:DateTime}
147+
AND ref = 'refs/heads/main'
148+
GROUP BY sha
149+
) AS push_dedup ON wf.head_sha = push_dedup.sha
150+
WHERE
151+
wf.workflow_name IN {workflow_names:Array(String)}
152+
AND wf.workflow_event != 'workflow_dispatch'
153+
AND wf.head_branch = 'main'
154+
-- this timestamp should always be bigger than push_dedup.timestamp
155+
-- it is just a optimization as this column is indexed
156+
AND wf.created_at >= {lookback_time:DateTime}
157+
AND wf.dynamoKey LIKE 'pytorch/pytorch/%'
158+
ORDER BY
159+
wf.workflow_name, push_dedup.timestamp DESC, wf.head_sha, wf.name
149160
"""
150161

151162
result = CHCliFactory().client.query(
@@ -207,14 +218,15 @@ def _fetch_commit_history(self):
207218
lookback_time = datetime.now() - timedelta(hours=self.lookback_hours)
208219

209220
query = """
210-
SELECT DISTINCT
211-
head_commit.id as sha,
212-
head_commit.message as message,
213-
head_commit.timestamp as timestamp
214-
FROM default.push
215-
WHERE head_commit.timestamp >= {lookback_time:DateTime}
216-
AND ref = 'refs/heads/main'
217-
ORDER BY head_commit.timestamp DESC
221+
SELECT
222+
head_commit.id as sha,
223+
head_commit.message as message,
224+
max(head_commit.timestamp) as timestamp
225+
FROM default.push
226+
WHERE head_commit.timestamp >= {lookback_time:DateTime}
227+
AND ref = 'refs/heads/main'
228+
GROUP BY sha, message
229+
ORDER BY timestamp DESC
218230
"""
219231

220232
result = CHCliFactory().client.query(

0 commit comments

Comments
 (0)