Fixes lf_rollover_percentage query (#6493)

jeanschmidt · web-flow · commit e2efc3869e4f · 2025-04-02T17:21:24.000+02:00
After working on #6489 I wanted to apply the same fixes to `lf_rollover_percentage`, so check there for the description of the fixes. I noticed a problem: when there is no data for a period for either LF or Meta (which happens when no job is running in one of the clusters), the query skips those time buckets, so the graph shows zero for both LF and Meta. The user can't tell whether this was a period where 100% of the jobs ran in the Meta fleet or in the LF fleet, as both appear zeroed. No self-join strategy can deal with this edge case: a row will always match at least itself, so an outer join has no effect. This is not a problem for the experiment query, where 0 means either no job running in the experiment or no job running at all. So I created two selects, one with the LF rows and one with the Meta rows, and then outer joined them. This makes it possible to have data for the periods where one of the fleets is at zero, but it does not make it possible to have the complementary graph (% at Meta, % at LF). To avoid overcomplicating this query with two merges and then combining both tables, I return only one of the percentages. I chose the LF fleet because of the title of the graph, "Percentage of Jobs rolled over Linux Foundation". ![Screenshot 2025-04-02 at 17 17 36](https://github.com/user-attachments/assets/0e62d6bb-864b-42aa-906b-4446d347e5d9)
diff --git a/torchci/clickhouse_queries/lf_rollover_percentage/query.sql b/torchci/clickhouse_queries/lf_rollover_percentage/query.sql
@@ -4,7 +4,7 @@ WITH
             l AS label,
             extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names
             j.workflow_name,
-            toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket
+            toStartOfInterval(j.created_at, INTERVAL 1 HOUR) AS bucket
         FROM
             -- Deliberatly not adding FINAL to this workflow_job.
             -- Risks of not using it:
@@ -32,16 +32,13 @@ WITH
         FROM
             normalized_jobs AS j
         WHERE
-            j.label LIKE 'lf%'
+            j.label LIKE 'lf.%'
     ),
-    -- filter jobs down to the ones that ran in both
-    -- LF and Meta fleets
     comparable_jobs AS (
         SELECT
             j.bucket,
             j.label,
             j.job_name,
-            -- Remove shard number and label from job names
             j.workflow_name
         FROM
             normalized_jobs AS j
@@ -50,42 +47,53 @@ WITH
     ),
     success_stats AS (
         SELECT
-            bucket,
             count(*) AS group_size,
-            job_name,
-            workflow_name,
-            label,
+            bucket,
+            replaceOne(label, 'lf.', '') AS label_ref,
             if(substring(label, 1, 3) = 'lf.', True, False) AS lf_fleet
         FROM
             comparable_jobs
         GROUP BY
-            bucket, job_name, workflow_name, label
+            bucket, label_ref, lf_fleet
+    ),
+    lf_success_stats AS (
+        SELECT
+            *
+        FROM
+            success_stats
+        WHERE
+            lf_fleet = True
+    ),
+    meta_success_stats AS (
+        SELECT
+            *
+        FROM
+            success_stats
+        WHERE
+            lf_fleet = False
     ),
     comparison_stats AS (
         SELECT
-            lf.bucket,
-            SUM(lf.group_size + m.group_size) AS total_jobs,
-            SUM(m.group_size) AS compliment_jobs,
-            SUM(lf.group_size) AS counted_jobs,
-            m.lf_fleet AS c_fleet,
-            lf.lf_fleet AS m_fleet,
+            -- *
+            greatest(lf.bucket, m.bucket) AS bucket,
             CAST(SUM(lf.group_size) AS Float32) / SUM(lf.group_size + m.group_size) * 100 AS percentage,
-            IF(lf.lf_fleet, 'Linux Foundation', 'Meta') AS fleet
+            -- IF(lf.lf_fleet, 'Linux Foundation', 'Meta') AS fleet
+            'Linux Fundation' AS fleet
         FROM
-            success_stats AS lf
-        INNER JOIN
-            success_stats AS m ON lf.bucket = m.bucket
-        WHERE
-            lf.job_name = m.job_name
-            AND lf.workflow_name = m.workflow_name
-            AND (
-                (lf.lf_fleet = 1 AND m.lf_fleet = 0)
-                OR (lf.lf_fleet = 0 AND m.lf_fleet = 1)
-            )
-            AND lf.group_size > 3
-            AND m.group_size > 3
+            lf_success_stats AS lf
+        FULL OUTER JOIN
+            meta_success_stats AS m
+        ON
+            lf.label_ref = m.label_ref
+            AND lf.bucket = m.bucket
         GROUP BY
-            lf.bucket, lf.lf_fleet, m.lf_fleet
+            bucket
     )
-SELECT * FROM comparison_stats
-ORDER BY  bucket DESC, fleet
+SELECT
+    bucket,
+    fleet,
+    avg(percentage) OVER (ORDER BY bucket DESC ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS percentage
+FROM
+    comparison_stats
+ORDER BY
+    bucket DESC