Skip to content

Commit df2ef70

Browse files
authored
optimize groupsummary/ api, also reduce intermediate data structure size. (#8904)
1 parent 50ce058 commit df2ef70

File tree

1 file changed

+63 −50 lines changed

treeherder/webapp/api/groups.py

Lines changed: 63 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -42,62 +42,75 @@ def list(self, request):
4242
if (enddate - startdate).days > 1:
4343
enddate = startdate + datetime.timedelta(days=1)
4444

45-
q = (
46-
Job.objects.filter(push__time__range=(startdate.date(), enddate.date()))
47-
.filter(repository_id__in=(1, 77))
48-
.values(
49-
"job_log__groups__name",
50-
"job_type__name",
51-
"job_log__group_result__status",
52-
"failure_classification_id",
53-
)
54-
.annotate(job_count=Count("id"))
55-
.order_by("job_log__groups__name")
56-
)
57-
self.queryset = q
58-
serializer = self.get_serializer(self.queryset, many=True)
5945
summary = {}
60-
job_type_names = []
61-
for item in serializer.data:
62-
if not item["group_name"] or not item["job_type_name"]:
63-
continue
64-
65-
if not item["job_type_name"].startswith("test-"):
66-
continue
67-
68-
if int(item["group_status"]) == 1: # ok
69-
result = "passed"
70-
elif int(item["group_status"]) == 2: # testfailed
71-
result = "testfailed"
72-
else:
73-
# other: 3 (skipped), 10 (unsupported (i.e. crashed))
74-
# we don't want to count this at all
75-
continue
76-
77-
# TODO: consider stripping out some types; mostly care about FBC vs Intermittent
78-
classification = item["failure_classification"]
79-
80-
if item["job_type_name"] not in job_type_names:
81-
job_type_names.append(item["job_type_name"])
82-
if item["group_name"] not in summary:
83-
summary[item["group_name"]] = {}
84-
if item["job_type_name"] not in summary[item["group_name"]]:
85-
summary[item["group_name"]][item["job_type_name"]] = {}
86-
if result not in summary[item["group_name"]][item["job_type_name"]]:
87-
summary[item["group_name"]][item["job_type_name"]][result] = {}
88-
if classification not in summary[item["group_name"]][item["job_type_name"]][result]:
89-
summary[item["group_name"]][item["job_type_name"]][result][classification] = 0
90-
summary[item["group_name"]][item["job_type_name"]][result][classification] += item[
91-
"job_count"
92-
]
93-
94-
data = {"job_type_names": job_type_names, "manifests": []}
46+
job_type_names = {}
47+
job_type_counter = 0
48+
49+
for platform in ["windows", "linux", "macosx", "android"]:
50+
q = (
51+
Job.objects.filter(
52+
push__time__range=(startdate.date(), enddate.date()),
53+
repository_id__in=(1, 77),
54+
job_type__name__startswith=f"test-{platform}", # Filter at DB level
55+
job_log__group_result__status__in=(1, 2), # Only OK and ERROR statuses
56+
)
57+
.select_related("job_type", "job_log") # Reduce queries
58+
.values(
59+
"job_log__groups__name",
60+
"job_type__name",
61+
"job_log__group_result__status",
62+
"failure_classification_id",
63+
)
64+
.annotate(job_count=Count("id"))
65+
.order_by("job_log__groups__name")
66+
)
67+
68+
self.queryset = q
69+
serializer = self.get_serializer(self.queryset, many=True)
70+
for item in serializer.data:
71+
group_name = item["group_name"]
72+
job_type_name = item["job_type_name"]
73+
group_status = int(item["group_status"])
74+
classification = item["failure_classification"]
75+
job_count = item["job_count"]
76+
77+
if not group_name or not job_type_name:
78+
continue
79+
80+
# serialize job_type_name (remove chunk number)
81+
parts = job_type_name.split("-")
82+
try:
83+
_ = int(parts[-1])
84+
job_type_name = "-".join(parts[:-1])
85+
except ValueError:
86+
pass
87+
88+
result = "passed" if group_status == 1 else "testfailed"
89+
90+
if job_type_name not in job_type_names:
91+
job_type_names[job_type_name] = job_type_counter
92+
jt_index = job_type_counter
93+
job_type_counter += 1
94+
else:
95+
jt_index = job_type_names[job_type_name]
96+
97+
if group_name not in summary:
98+
summary[group_name] = {}
99+
if jt_index not in summary[group_name]:
100+
summary[group_name][jt_index] = {}
101+
if result not in summary[group_name][jt_index]:
102+
summary[group_name][jt_index][result] = {}
103+
if classification not in summary[group_name][jt_index][result]:
104+
summary[group_name][jt_index][result][classification] = 0
105+
summary[group_name][jt_index][result][classification] += job_count
106+
107+
data = {"job_type_names": job_type_names.keys(), "manifests": []}
95108
for m in summary.keys():
96109
mdata = []
97110
for d in summary[m]:
98111
for r in summary[m][d]:
99112
for c in summary[m][d][r]:
100-
mdata.append([job_type_names.index(d), r, int(c), summary[m][d][r][c]])
113+
mdata.append([d, r, int(c), summary[m][d][r][c]])
101114
data["manifests"].append({m: mdata})
102115

103116
return Response(data=data)

0 commit comments

Comments (0)