1- -- !!! Query is not converted to CH syntax yet. Delete this line when it gets converted
2-
3- WITH normalized_jobs AS (
1+ -- This query is used to generate a chart on HUD to compare the success, cancellation, and duration rates
2+ -- of jobs in the Meta fleet vs jobs in the LF fleet over time.
3+ WITH
4+ normalized_jobs AS (
45 SELECT
56 j .started_at ,
6- ROUND(DATE_DIFF(' MINUTE' , PARSE_TIMESTAMP_ISO8601(j .started_at ), PARSE_TIMESTAMP_ISO8601(j .completed_at )), 1 ) as duration_min,
7- if(
8- strpos(l .label , ' amz2023.' ) = 0 ,
9- l .label ,
10- CONCAT(
11- substr(l .label , 1 , strpos(l .label , ' amz2023.' ) - 1 ),
12- substr(l .label , length(' amz2023.' ) + strpos(l .label , ' amz2023.' ))
13- )
14- ) as label,
15- REGEXP_EXTRACT(j .name , ' ([^,]*),?' , 1 ) as job_name, -- remove shard number and label from job names
7+ ROUND(dateDiff(' minute' , j .started_at , j .completed_at ), 1 ) as duration_min,
8+ l as label,
9+ extract(j .name , ' [^,]*' ) as job_name, -- remove shard number and label from job names
1610 j .workflow_name ,
1711 j .conclusion ,
18- DATE_TRUNC(:granularity, PARSE_TIMESTAMP_ISO8601(j .started_at ))
19- AS bucket,
12+ toStartOfInterval(j .started_at , INTERVAL 1 DAY) AS bucket
2013 FROM
21- commons .workflow_job j
22- CROSS JOIN UNNEST(j .labels as label) as l
23- WHERE 1 = 1
24- AND j .labels is not NULL
25- AND j ._event_time > CURRENT_DATETIME() - DAYS(:days_ago)
14+ -- Deliberatly not adding FINAL to this workflow_job.
15+ -- Risks of not using it:
16+ -- - You may get duplicate records for rows that were updated corresponding to their
17+ -- before/after states, but as long as there’s some mechanism in the query to account
18+ -- for that it’s okay (we check for j.status = 'completed`).
19+ -- - In the worst case scenario, you may only see the ‘old’ version of the records for some rows
20+ -- Costs of using it:
21+ -- - Query procesing time increases from ~5 -> 16 seconds
22+ -- - Memory usage grows from 7.5 GB -> 32 GB
23+ -- So the tradeoff is worth it for this query.
24+ workflow_job as j
25+ ARRAY JOIN
26+ j .labels as l
27+ WHERE
28+ j .labels IS NOT NULL -- prob unnecessary now
29+ AND j .created_at > now() - interval {days_ago: Int64} day
2630 AND j .status = ' completed'
27- AND l . label != ' self-hosted'
28- AND l . label not like ' lf.c.%'
29- AND l . label not like ' %canary%'
30-
31- ), migrated_jobs AS (
31+ AND l != ' self-hosted'
32+ AND l NOT LIKE ' lf.c.%'
33+ AND l NOT LIKE ' %canary%'
34+ ),
35+ lf_jobs AS (
3236 SELECT DISTINCT
3337 j .job_name
3438 FROM
35- normalized_jobs j
36- WHERE 1 = 1
37- AND j .label like ' lf%'
38- ), comparable_jobs AS (
39- SELECT
40- -- count(*)
41- j .bucket ,
42- j .started_at ,
43- j .duration_min ,-- -- j.completed_at,
44- j .label ,
45- j .job_name , -- remove shard number and label from job names
46- j .workflow_name ,
47- j .conclusion ,
48- FROM
49- normalized_jobs j
50- CROSS JOIN migrated_jobs mj
51- WHERE 1 = 1
52- AND j .job_name = mj .job_name
53- -- AND STRPOS(j.name, mj.job_clean) > 0
54-
55- ), success_stats AS (
39+ normalized_jobs as j
40+ WHERE
41+ j .label LIKE ' lf%'
42+ ),
43+ comparable_jobs AS (
44+ SELECT
45+ j .bucket ,
46+ j .started_at ,
47+ j .duration_min ,
48+ j .label ,
49+ j .job_name ,
50+ j .workflow_name ,
51+ j .conclusion
52+ FROM
53+ normalized_jobs as j
54+ CROSS JOIN lf_jobs as lfj
55+ WHERE
56+ j .job_name = lfj .job_name
57+ ),
58+ success_stats AS (
5659 SELECT
5760 bucket,
5861 count (* ) as group_size,
5962 job_name,
6063 workflow_name,
6164 label,
62- IF(SUBSTR(label, 1 , 3 ) = ' lf.' , True, False ) as lf_fleet,
63- SUM (
64- CASE
65- WHEN conclusion = ' success' THEN 1
66- ELSE 0
67- END
68- ) * 100 / (COUNT_IF(conclusion != ' cancelled' ) + 1 ) as success_rate, -- plus one is to handle divide by zero errors
69- SUM (
70- CASE
71- WHEN conclusion = ' failure' THEN 1
72- ELSE 0
73- END
74- ) * 100 / (COUNT_IF(conclusion != ' cancelled' ) + 1 ) as failure_rate,
75- SUM (
76- CASE
77- WHEN conclusion = ' cancelled' THEN 1
78- ELSE 0
79- END
80- ) * 100 / COUNT (* ) as cancelled_rate, -- cancelled rate is calculated over all jobs
81- SUM (
82- CASE
83- WHEN conclusion = ' success' THEN 1
84- ELSE 0
85- END
86- ) as success_count,
87- SUM (
88- CASE
89- WHEN conclusion = ' failure' THEN 1
90- ELSE 0
91- END
92- ) as failure_count,
93- SUM (
94- CASE
95- WHEN conclusion = ' cancelled' THEN 1
96- ELSE 0
97- END
98- ) as cancelled_count,
99- COUNT (* ) as total_count,
100- SUM (
101- CASE
102- WHEN conclusion = ' success' THEN duration_min
103- ELSE 0
104- END
105- ) / COUNT (* ) as success_avg_duration,
106- SUM (
107- CASE
108- WHEN conclusion = ' failure' THEN duration_min
109- ELSE 0
110- END
111- ) / COUNT (* ) as failure_avg_duration,
112- SUM (
113- CASE
114- WHEN conclusion = ' cancelled' THEN duration_min
115- ELSE 0
116- END
117- ) / COUNT (* ) as cancelled_avg_duration,
118-
65+ if(startsWith(label, ' lf.' ), 1 , 0 ) as lf_fleet,
66+ countIf(conclusion = ' success' ) * 100 / (countIf(conclusion != ' cancelled' ) + 1 ) as success_rate, -- plus one is to handle divide by zero errors
67+ countIf(conclusion = ' failure' ) * 100 / (countIf(conclusion != ' cancelled' ) + 1 ) as failure_rate,
68+ countIf(conclusion = ' cancelled' ) * 100 / COUNT () as cancelled_rate,
69+ avgIf(duration_min, conclusion = ' success' ) as success_avg_duration,
70+ avgIf(duration_min, conclusion = ' failure' ) as failure_avg_duration,
71+ avgIf(duration_min, conclusion = ' cancelled' ) as cancelled_avg_duration
11972 FROM comparable_jobs
120- GROUP BY
121- bucket, job_name, workflow_name, label
122- ), comparison_stats AS (
123- SELECT
124- lf .bucket ,
125- lf .workflow_name ,
126- lf .job_name ,
127- lf .group_size as sample_size_lf,
128- m .group_size as sample_size_meta,
129- lf .success_rate - m .success_rate as success_rate_delta,
130- lf .failure_rate - m .failure_rate as failure_rate_delta,
131- lf .cancelled_rate - m .cancelled_rate as cancelled_rate_delta,
132- IF(m .success_avg_duration = 0 , 1 , ROUND(lf .success_avg_duration * 1 .0 / m .success_avg_duration , 2 )) as success_duration_increase_ratio,
133- FROM success_stats lf
134- INNER JOIN success_stats m on lf .bucket = m .bucket
135- WHERE 1 = 1
136- AND lf .job_name = m .job_name
137- AND lf .workflow_name = m .workflow_name
138- AND lf .lf_fleet = True
139- AND m .lf_fleet = False
140- AND lf .group_size > 3
141- AND m .group_size > 3
73+ GROUP BY bucket, job_name, workflow_name, label
74+ ),
75+ comparison_stats AS (
76+ SELECT
77+ lf .bucket ,
78+ lf .workflow_name ,
79+ lf .job_name ,
80+ lf .group_size as sample_size_lf,
81+ m .group_size as sample_size_meta,
82+ lf .success_rate - m .success_rate as success_rate_delta,
83+ lf .failure_rate - m .failure_rate as failure_rate_delta,
84+ lf .cancelled_rate - m .cancelled_rate as cancelled_rate_delta,
85+ if(m .success_avg_duration = 0 , 1 , round(lf .success_avg_duration / m .success_avg_duration , 2 )) as success_duration_increase_ratio
86+ FROM
87+ success_stats as lf
88+ JOIN
89+ success_stats as m
90+ ON
91+ lf .bucket = m .bucket
92+ AND lf .job_name = m .job_name
93+ AND lf .workflow_name = m .workflow_name
94+ WHERE
95+ lf .lf_fleet = 1
96+ AND m .lf_fleet = 0
97+ -- the group size limit reduces noise from low sample sizes
98+ AND lf .group_size > 3
99+ AND m .group_size > 3
142100)
143- SELECT * from comparison_stats
144- ORDER by bucket desc , job_name desc , success_rate_delta, workflow_name
101+ SELECT
102+ *
103+ FROM
104+ comparison_stats
105+ ORDER BY
106+ bucket DESC , job_name DESC , success_rate_delta, workflow_name
0 commit comments