Skip to content

Commit 5b770fb

Browse files
authored
[CF] Convert lf_rollover_health query to clickhouse (#5819)
Converts the lf_rollover_health query to use clickhouse. It also: - Corrects a bug in the query about how it calculated the average duration by fixing the denominator. - Removes the temporary amz2023 label prefix handling, since those workflows are obsolete now anyways Validation: - Ensured that the ClickHouse query showed the same results as the Rockset query (except for the duration comparison chart, which also shows matching results if I reintroduce the average duration bug)
1 parent 7188100 commit 5b770fb

File tree

2 files changed

+126
-149
lines changed

2 files changed

+126
-149
lines changed
Lines changed: 89 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,144 +1,106 @@
1-
-- !!! Query is not converted to CH syntax yet. Delete this line when it gets converted
2-
3-
WITH normalized_jobs AS (
1+
-- This query is used to generate a chart on HUD to compare the success, cancellation, and duration rates
2+
-- of jobs in the Meta fleet vs jobs in the LF fleet over time.
3+
WITH
4+
normalized_jobs AS (
45
SELECT
56
j.started_at,
6-
ROUND(DATE_DIFF('MINUTE', PARSE_TIMESTAMP_ISO8601(j.started_at), PARSE_TIMESTAMP_ISO8601(j.completed_at)), 1) as duration_min,
7-
if(
8-
strpos(l.label, 'amz2023.') = 0,
9-
l.label,
10-
CONCAT(
11-
substr(l.label, 1, strpos(l.label, 'amz2023.') - 1),
12-
substr(l.label, length('amz2023.') + strpos(l.label, 'amz2023.'))
13-
)
14-
) as label,
15-
REGEXP_EXTRACT(j.name, '([^,]*),?', 1) as job_name, -- remove shard number and label from job names
7+
ROUND(dateDiff('minute', j.started_at, j.completed_at), 1) as duration_min,
8+
l as label,
9+
extract(j.name, '[^,]*') as job_name, -- remove shard number and label from job names
1610
j.workflow_name,
1711
j.conclusion,
18-
DATE_TRUNC(:granularity, PARSE_TIMESTAMP_ISO8601(j.started_at))
19-
AS bucket,
12+
toStartOfInterval(j.started_at, INTERVAL 1 DAY) AS bucket
2013
FROM
21-
commons.workflow_job j
22-
CROSS JOIN UNNEST(j.labels as label) as l
23-
WHERE 1=1
24-
AND j.labels is not NULL
25-
AND j._event_time > CURRENT_DATETIME() - DAYS(:days_ago)
14+
-- Deliberatly not adding FINAL to this workflow_job.
15+
-- Risks of not using it:
16+
-- - You may get duplicate records for rows that were updated corresponding to their
17+
-- before/after states, but as long as there’s some mechanism in the query to account
18+
-- for that it’s okay (we check for j.status = 'completed`).
19+
-- - In the worst case scenario, you may only see the ‘old’ version of the records for some rows
20+
-- Costs of using it:
21+
-- - Query procesing time increases from ~5 -> 16 seconds
22+
-- - Memory usage grows from 7.5 GB -> 32 GB
23+
-- So the tradeoff is worth it for this query.
24+
workflow_job as j
25+
ARRAY JOIN
26+
j.labels as l
27+
WHERE
28+
j.labels IS NOT NULL -- prob unnecessary now
29+
AND j.created_at > now() - interval {days_ago: Int64} day
2630
AND j.status = 'completed'
27-
AND l.label != 'self-hosted'
28-
AND l.label not like 'lf.c.%'
29-
AND l.label not like '%canary%'
30-
31-
), migrated_jobs AS (
31+
AND l != 'self-hosted'
32+
AND l NOT LIKE 'lf.c.%'
33+
AND l NOT LIKE '%canary%'
34+
),
35+
lf_jobs AS (
3236
SELECT DISTINCT
3337
j.job_name
3438
FROM
35-
normalized_jobs j
36-
WHERE 1=1
37-
AND j.label like 'lf%'
38-
), comparable_jobs AS (
39-
SELECT
40-
-- count(*)
41-
j.bucket,
42-
j.started_at,
43-
j.duration_min,-- -- j.completed_at,
44-
j.label,
45-
j.job_name, -- remove shard number and label from job names
46-
j.workflow_name,
47-
j.conclusion,
48-
FROM
49-
normalized_jobs j
50-
CROSS JOIN migrated_jobs mj
51-
WHERE 1 = 1
52-
AND j.job_name = mj.job_name
53-
-- AND STRPOS(j.name, mj.job_clean) > 0
54-
55-
), success_stats AS (
39+
normalized_jobs as j
40+
WHERE
41+
j.label LIKE 'lf%'
42+
),
43+
comparable_jobs AS (
44+
SELECT
45+
j.bucket,
46+
j.started_at,
47+
j.duration_min,
48+
j.label,
49+
j.job_name,
50+
j.workflow_name,
51+
j.conclusion
52+
FROM
53+
normalized_jobs as j
54+
CROSS JOIN lf_jobs as lfj
55+
WHERE
56+
j.job_name = lfj.job_name
57+
),
58+
success_stats AS (
5659
SELECT
5760
bucket,
5861
count(*) as group_size,
5962
job_name,
6063
workflow_name,
6164
label,
62-
IF(SUBSTR(label, 1, 3) = 'lf.', True, False ) as lf_fleet,
63-
SUM(
64-
CASE
65-
WHEN conclusion = 'success' THEN 1
66-
ELSE 0
67-
END
68-
) * 100 / (COUNT_IF(conclusion != 'cancelled') + 1) as success_rate, -- plus one is to handle divide by zero errors
69-
SUM(
70-
CASE
71-
WHEN conclusion = 'failure' THEN 1
72-
ELSE 0
73-
END
74-
) * 100 / (COUNT_IF(conclusion != 'cancelled') + 1) as failure_rate,
75-
SUM(
76-
CASE
77-
WHEN conclusion = 'cancelled' THEN 1
78-
ELSE 0
79-
END
80-
) * 100 / COUNT(*) as cancelled_rate, -- cancelled rate is calculated over all jobs
81-
SUM(
82-
CASE
83-
WHEN conclusion = 'success' THEN 1
84-
ELSE 0
85-
END
86-
) as success_count,
87-
SUM(
88-
CASE
89-
WHEN conclusion = 'failure' THEN 1
90-
ELSE 0
91-
END
92-
) as failure_count,
93-
SUM(
94-
CASE
95-
WHEN conclusion = 'cancelled' THEN 1
96-
ELSE 0
97-
END
98-
) as cancelled_count,
99-
COUNT(*) as total_count,
100-
SUM(
101-
CASE
102-
WHEN conclusion = 'success' THEN duration_min
103-
ELSE 0
104-
END
105-
) / COUNT(*) as success_avg_duration,
106-
SUM(
107-
CASE
108-
WHEN conclusion = 'failure' THEN duration_min
109-
ELSE 0
110-
END
111-
) / COUNT(*) as failure_avg_duration,
112-
SUM(
113-
CASE
114-
WHEN conclusion = 'cancelled' THEN duration_min
115-
ELSE 0
116-
END
117-
) / COUNT(*) as cancelled_avg_duration,
118-
65+
if(startsWith(label, 'lf.'), 1, 0 ) as lf_fleet,
66+
countIf(conclusion = 'success') * 100 / (countIf(conclusion != 'cancelled') + 1) as success_rate, -- plus one is to handle divide by zero errors
67+
countIf(conclusion = 'failure') * 100 / (countIf(conclusion != 'cancelled') + 1) as failure_rate,
68+
countIf(conclusion = 'cancelled') * 100 / COUNT() as cancelled_rate,
69+
avgIf(duration_min, conclusion = 'success') as success_avg_duration,
70+
avgIf(duration_min, conclusion = 'failure') as failure_avg_duration,
71+
avgIf(duration_min, conclusion = 'cancelled') as cancelled_avg_duration
11972
FROM comparable_jobs
120-
GROUP BY
121-
bucket, job_name, workflow_name, label
122-
), comparison_stats AS (
123-
SELECT
124-
lf.bucket,
125-
lf.workflow_name,
126-
lf.job_name,
127-
lf.group_size as sample_size_lf,
128-
m.group_size as sample_size_meta,
129-
lf.success_rate - m.success_rate as success_rate_delta,
130-
lf.failure_rate - m.failure_rate as failure_rate_delta,
131-
lf.cancelled_rate - m.cancelled_rate as cancelled_rate_delta,
132-
IF(m.success_avg_duration = 0, 1, ROUND(lf.success_avg_duration * 1.0 / m.success_avg_duration, 2)) as success_duration_increase_ratio,
133-
FROM success_stats lf
134-
INNER JOIN success_stats m on lf.bucket = m.bucket
135-
WHERE 1 = 1
136-
AND lf.job_name = m.job_name
137-
AND lf.workflow_name = m.workflow_name
138-
AND lf.lf_fleet = True
139-
AND m.lf_fleet = False
140-
AND lf.group_size > 3
141-
AND m.group_size > 3
73+
GROUP BY bucket, job_name, workflow_name, label
74+
),
75+
comparison_stats AS (
76+
SELECT
77+
lf.bucket,
78+
lf.workflow_name,
79+
lf.job_name,
80+
lf.group_size as sample_size_lf,
81+
m.group_size as sample_size_meta,
82+
lf.success_rate - m.success_rate as success_rate_delta,
83+
lf.failure_rate - m.failure_rate as failure_rate_delta,
84+
lf.cancelled_rate - m.cancelled_rate as cancelled_rate_delta,
85+
if(m.success_avg_duration = 0, 1, round(lf.success_avg_duration / m.success_avg_duration, 2)) as success_duration_increase_ratio
86+
FROM
87+
success_stats as lf
88+
JOIN
89+
success_stats as m
90+
ON
91+
lf.bucket = m.bucket
92+
AND lf.job_name = m.job_name
93+
AND lf.workflow_name = m.workflow_name
94+
WHERE
95+
lf.lf_fleet = 1
96+
AND m.lf_fleet = 0
97+
-- the group size limit reduces noise from low sample sizes
98+
AND lf.group_size > 3
99+
AND m.group_size > 3
142100
)
143-
SELECT * from comparison_stats
144-
ORDER by bucket desc, job_name desc, success_rate_delta, workflow_name
101+
SELECT
102+
*
103+
FROM
104+
comparison_stats
105+
ORDER BY
106+
bucket DESC, job_name DESC, success_rate_delta, workflow_name

torchci/pages/metrics.tsx

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,7 +1144,7 @@ export default function Page() {
11441144
</Typography>
11451145
<p>
11461146
These panels show the <b>delta</b> between states of the same job
1147-
run on the Linux Foundation and Meta fleets.
1147+
run on the Linux Foundation vs the Meta fleets.
11481148
</p>
11491149
</Grid>
11501150

@@ -1153,19 +1153,24 @@ export default function Page() {
11531153
title={"LF vs Meta: Success rate delta"}
11541154
queryName={"lf_rollover_health"}
11551155
queryCollection={"metrics"}
1156-
queryParams={[
1157-
{
1158-
name: "days_ago",
1159-
type: "int",
1160-
value: timeRange,
1161-
},
1162-
]}
1156+
queryParams={
1157+
useClickHouse
1158+
? { ...timeParamsClickHouse, days_ago: timeRange }
1159+
: [
1160+
{
1161+
name: "days_ago",
1162+
type: "int",
1163+
value: timeRange,
1164+
},
1165+
]
1166+
}
11631167
granularity={"day"}
11641168
timeFieldName={"bucket"}
11651169
yAxisLabel={"rate delta"}
11661170
yAxisFieldName={"success_rate_delta"}
11671171
yAxisRenderer={(value) => value}
11681172
groupByFieldName={"job_name"}
1173+
useClickHouse={useClickHouse}
11691174
/>
11701175
</Grid>
11711176

@@ -1174,19 +1179,24 @@ export default function Page() {
11741179
title={"LF vs Meta: Cancelled rate delta"}
11751180
queryName={"lf_rollover_health"}
11761181
queryCollection={"metrics"}
1177-
queryParams={[
1178-
{
1179-
name: "days_ago",
1180-
type: "int",
1181-
value: timeRange,
1182-
},
1183-
]}
1182+
queryParams={
1183+
useClickHouse
1184+
? { ...timeParamsClickHouse, days_ago: timeRange }
1185+
: [
1186+
{
1187+
name: "days_ago",
1188+
type: "int",
1189+
value: timeRange,
1190+
},
1191+
]
1192+
}
11841193
granularity={"day"}
11851194
timeFieldName={"bucket"}
11861195
yAxisLabel={"rate delta"}
11871196
yAxisFieldName={"cancelled_rate_delta"}
11881197
yAxisRenderer={(value) => value}
11891198
groupByFieldName={"job_name"}
1199+
useClickHouse={useClickHouse}
11901200
/>
11911201
</Grid>
11921202

@@ -1195,19 +1205,24 @@ export default function Page() {
11951205
title={"LF vs Meta: Duration increase ratio"}
11961206
queryName={"lf_rollover_health"}
11971207
queryCollection={"metrics"}
1198-
queryParams={[
1199-
{
1200-
name: "days_ago",
1201-
type: "int",
1202-
value: timeRange,
1203-
},
1204-
]}
1208+
queryParams={
1209+
useClickHouse
1210+
? { ...timeParamsClickHouse, days_ago: timeRange }
1211+
: [
1212+
{
1213+
name: "days_ago",
1214+
type: "int",
1215+
value: timeRange,
1216+
},
1217+
]
1218+
}
12051219
granularity={"day"}
12061220
timeFieldName={"bucket"}
12071221
yAxisLabel="increase ratio"
12081222
yAxisFieldName={"success_duration_increase_ratio"}
12091223
yAxisRenderer={(value) => value}
12101224
groupByFieldName={"job_name"}
1225+
useClickHouse={useClickHouse}
12111226
/>
12121227
</Grid>
12131228
<Grid item xs={12} height={ROW_HEIGHT}>

0 commit comments

Comments
 (0)