Skip to content

Commit 75212b8

Browse files
authored
[ch] optimize master_commit_red_percent query (#6580)
perf:

```
./clickhouse_query_perf.py --query master_commit_red_percent --perf --times 1 --base HEAD
+------+----------+-----------+-------------+---------------+-----------+------------+-------------+--------------+
| Test | Avg Time | Base Time | Time Change | % Time Change |  Avg Mem  |  Base Mem  | Mem Change  | % Mem Change |
+------+----------+-----------+-------------+---------------+-----------+------------+-------------+--------------+
|  0   |   1164   |   5585    |    -4421    |      -79      | 160555724 | 7544036697 | -7383480973 |     -98      |
|  1   |   2992   |   5483    |    -2491    |      -45      | 516092615 | 7858494950 | -7342402335 |     -93      |
+------+----------+-----------+-------------+---------------+-----------+------------+-------------+--------------+
```

Removes the expensive join and improves pre-filtering by using a materialized view.

### Testing

The new query in some cases returns slightly different results for `broken_trunk_red` metrics (although results for the `all_red` metric are identical), which leads me to conclude that the partitioning:

```
ROW_NUMBER() OVER(
    PARTITION BY j.name, j.head_sha
    ORDER BY j.run_attempt DESC
) AS row_num
```

is not stable (missing workflow_name? each row can have multiple pushes?), but overall the results are close enough (ε < 0.01), and further debugging and fixing can be done in a separate PR.
1 parent 971ac9d commit 75212b8

File tree

2 files changed

+46
-29
lines changed

2 files changed

+46
-29
lines changed

torchci/clickhouse_queries/master_commit_red_percent/params.json

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,24 @@
55
"stopTime": "DateTime64(3)",
66
"workflowNames": "Array(String)"
77
},
8-
"tests": []
8+
"tests": [
9+
{
10+
"granularity": "day",
11+
"startTime": "2025-03-18T21:09:47.987",
12+
"stopTime": "2025-03-25T21:09:47.987",
13+
"workflowNames": ["lint", "pull", "trunk"]
14+
},
15+
{
16+
"granularity": "day",
17+
"startTime": "2025-03-18T21:09:47.987",
18+
"stopTime": "2025-05-25T21:09:47.987",
19+
"workflowNames": ["lint", "pull", "trunk"]
20+
},
21+
{
22+
"granularity": "day",
23+
"startTime": "2025-03-24T00:00:00.000",
24+
"stopTime": "2025-03-25T00:00:00.000",
25+
"workflowNames": ["pull"]
26+
}
27+
]
928
}

torchci/clickhouse_queries/master_commit_red_percent/query.sql

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,43 @@
1-
-- TODO (huydhn): This query tracks the number of red commits on HUD KPIs page. This
2-
-- is not the most efficient query around both in term of speed and memory usage. So,
3-
-- a good BE task is to re-write this in a more efficient way, HUD code is also
4-
-- subjected to change if need be
5-
WITH join_with_workflow_run AS (
6-
-- Do the join with workflow_run, then workflow_job to avoid OOM
7-
SELECT
8-
w.id AS id,
1+
-- huydhn: This query tracks the number of red commits on HUD KPIs page.
2+
WITH pushes AS ( -- very selective
3+
select
94
p.head_commit. 'timestamp' AS time,
105
p.head_commit. 'id' AS sha
11-
FROM
6+
from
127
default .push p FINAL
13-
JOIN default .workflow_run w FINAL ON w.head_commit. 'id' = p.head_commit. 'id'
14-
WHERE
15-
(
16-
-- Limit it to workflows which block viable/strict upgrades
17-
has({workflowNames: Array(String) }, lower(w.name))
18-
OR w.name like 'linux-binary%'
19-
)
20-
AND w.event != 'workflow_run' -- Filter out workflow_run-triggered jobs, which have nothing to do with the SHA
21-
AND p.ref = 'refs/heads/main'
22-
AND p.repository. 'owner'.'name' = 'pytorch'
23-
AND p.repository. 'name' = 'pytorch'
24-
AND p.head_commit. 'timestamp' >= {startTime: DateTime64(3) }
25-
AND p.head_commit. 'timestamp' < {stopTime: DateTime64(3) }
8+
where
9+
p.ref = 'refs/heads/main'
10+
and p.repository. 'owner'.'name' = 'pytorch'
11+
and p.repository. 'name' = 'pytorch'
12+
and p.head_commit. 'timestamp' >= {startTime: DateTime64(3) }
13+
and p.head_commit. 'timestamp' < {stopTime: DateTime64(3) }
2614
),
2715
all_jobs AS (
2816
SELECT
29-
w.time AS time,
17+
p.time AS time,
3018
j.conclusion AS conclusion,
31-
w.sha AS sha,
19+
j.head_sha AS sha,
3220
ROW_NUMBER() OVER(
3321
PARTITION BY j.name,
34-
w.sha
22+
j.head_sha
3523
ORDER BY
3624
j.run_attempt DESC
3725
) AS row_num
3826
FROM
39-
join_with_workflow_run w
40-
JOIN default .workflow_job j FINAL ON w.id = j.run_id
27+
default .workflow_job j FINAL
28+
join pushes p FINAL on j.head_sha = p.sha
4129
WHERE
42-
j.name != 'ciflow_should_run'
30+
j.id in (
31+
SELECT id FROM materialized_views.workflow_job_by_head_sha
32+
WHERE head_sha in (SELECT distinct p.sha FROM pushes p)
33+
)
34+
AND j.workflow_event != 'workflow_run' -- Filter out workflow_run-triggered jobs, which have nothing to do with the SHA
35+
AND (
36+
-- Limit it to jobs which block viable/strict upgrades
37+
has({workflowNames: Array(String) }, lower(j.workflow_name))
38+
OR j.workflow_name like 'linux-binary%'
39+
)
40+
AND j.name != 'ciflow_should_run'
4341
AND j.name != 'generate-test-matrix'
4442
AND j.name NOT LIKE '%rerun_disabled_tests%'
4543
AND j.name NOT LIKE '%unstable%'

0 commit comments

Comments
 (0)