
Commit dc40de8

fix: dedup by computed_name

1 parent e6b347d

2 files changed: +140 -0 lines changed

graphql_api/tests/test_test_analytics.py

Lines changed: 108 additions & 0 deletions
@@ -75,6 +75,53 @@ def mock_storage(mocker):
 rows = [RowFactory()(datetime.datetime(2024, 1, 1 + i)) for i in range(5)]
 
 
+rows_with_duplicate_names = [
+    RowFactory()(datetime.datetime(2024, 1, 1 + i)) for i in range(5)
+]
+for i in range(0, len(rows_with_duplicate_names) - 1, 2):
+    rows_with_duplicate_names[i]["name"] = rows_with_duplicate_names[i + 1]["name"]
+
+
+def dedup(rows: list[dict]) -> list[dict]:
+    by_name = {}
+    for row in rows:
+        if row["name"] not in by_name:
+            by_name[row["name"]] = []
+        by_name[row["name"]].append(row)
+
+    result = []
+    for name, group in by_name.items():
+        if len(group) == 1:
+            result.append(group[0])
+            continue
+
+        weights = [r["total_pass_count"] + r["total_fail_count"] for r in group]
+        total_weight = sum(weights)
+
+        merged = {
+            "name": name,
+            "testsuite": sorted({r["testsuite"] for r in group}),
+            "flags": sorted({flag for r in group for flag in r["flags"]}),
+            "test_id": group[0]["test_id"],  # Keep first test_id
+            "failure_rate": sum(r["failure_rate"] * w for r, w in zip(group, weights))
+            / total_weight,
+            "flake_rate": sum(r["flake_rate"] * w for r, w in zip(group, weights))
+            / total_weight,
+            "updated_at": max(r["updated_at"] for r in group),
+            "avg_duration": sum(r["avg_duration"] * w for r, w in zip(group, weights))
+            / total_weight,
+            "total_fail_count": sum(r["total_fail_count"] for r in group),
+            "total_flaky_fail_count": sum(r["total_flaky_fail_count"] for r in group),
+            "total_pass_count": sum(r["total_pass_count"] for r in group),
+            "total_skip_count": sum(r["total_skip_count"] for r in group),
+            "commits_where_fail": sum(r["commits_where_fail"] for r in group),
+            "last_duration": max(r["last_duration"] for r in group),
+        }
+        result.append(merged)
+
+    return sorted(result, key=lambda x: x["updated_at"], reverse=True)
+
+
 def row_to_camel_case(row: dict) -> dict:
     return {
         "commitsFailed"
@@ -89,6 +136,7 @@ def row_to_camel_case(row: dict) -> dict:
 
 
 test_results_table = pl.DataFrame(rows)
+test_results_table_with_duplicate_names = pl.DataFrame(rows_with_duplicate_names)
 
 
 def base64_encode_string(x: str) -> str:
@@ -143,6 +191,21 @@ def store_in_storage(repository, mock_storage):
     )
 
 
+@pytest.fixture
+def store_in_redis_with_duplicate_names(repository):
+    redis = get_redis_connection()
+    redis.set(
+        f"test_results:{repository.repoid}:{repository.branch}:30",
+        test_results_table_with_duplicate_names.write_ipc(None).getvalue(),
+    )
+
+    yield
+
+    redis.delete(
+        f"test_results:{repository.repoid}:{repository.branch}:30",
+    )
+
+
 class TestAnalyticsTestCase(
     GraphQLTestHelper,
 ):
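The fixture caches the frame in Redis as Arrow IPC bytes via write_ipc(None).getvalue(). The read path is not part of this diff; a hypothetical reader-side sketch, assuming the same key layout (load_cached_results is an invented name):

import io

import polars as pl


def load_cached_results(redis, repoid: int, branch: str) -> pl.DataFrame | None:
    # Hypothetical helper: key layout copied from the fixture above.
    raw = redis.get(f"test_results:{repoid}:{branch}:30")
    if raw is None:
        return None
    # write_ipc(None) produced Arrow IPC bytes, so read_ipc restores the frame.
    return pl.read_ipc(io.BytesIO(raw))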
@@ -583,6 +646,51 @@ def test_gql_query(self, repository, store_in_redis, mock_storage):
             for row in reversed(rows)
         ]
 
+    def test_gql_query_with_duplicate_names(
+        self, repository, store_in_redis_with_duplicate_names, mock_storage
+    ):
+        query = base_gql_query % (
+            repository.author.username,
+            repository.name,
+            """
+            testResults(ordering: { parameter: UPDATED_AT, direction: DESC } ) {
+                totalCount
+                edges {
+                    cursor
+                    node {
+                        name
+                        failureRate
+                        flakeRate
+                        updatedAt
+                        avgDuration
+                        totalFailCount
+                        totalFlakyFailCount
+                        totalPassCount
+                        totalSkipCount
+                        commitsFailed
+                        lastDuration
+                    }
+                }
+            }
+            """,
+        )
+
+        result = self.gql_request(query, owner=repository.author)
+
+        assert (
+            result["owner"]["repository"]["testAnalytics"]["testResults"]["totalCount"]
+            == 3
+        )
+        assert result["owner"]["repository"]["testAnalytics"]["testResults"][
+            "edges"
+        ] == [
+            {
+                "cursor": cursor(row),
+                "node": row_to_camel_case(row),
+            }
+            for row in dedup(rows_with_duplicate_names)
+        ]
+
     def test_gql_query_aggregates(self, repository, store_in_redis, mock_storage):
         query = base_gql_query % (
             repository.author.username,
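Why totalCount == 3: the setup loop copies the name at each odd index onto the even index before it, so of the five generated rows, pairs (0, 1) and (2, 3) share a name and row 4 keeps its own. A quick check of that arithmetic on placeholder names:

# Mirrors the duplicate-name loop in the first hunk above.
names = ["a", "b", "c", "d", "e"]
for i in range(0, len(names) - 1, 2):
    names[i] = names[i + 1]
print(names)            # ['b', 'b', 'd', 'd', 'e']
print(len(set(names)))  # 3 -- five rows collapse to three unique names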

graphql_api/types/test_analytics/test_analytics.py

Lines changed: 32 additions & 0 deletions
@@ -197,6 +197,38 @@ def generate_test_results(
         },
     )
 
+    failure_rate_expr = (
+        pl.col("failure_rate")
+        * (pl.col("total_fail_count") + pl.col("total_pass_count"))
+    ).sum() / (pl.col("total_fail_count") + pl.col("total_pass_count")).sum()
+
+    flake_rate_expr = (
+        pl.col("flake_rate") * (pl.col("total_fail_count") + pl.col("total_pass_count"))
+    ).sum() / (pl.col("total_fail_count") + pl.col("total_pass_count")).sum()
+
+    # dedup
+    table = table.group_by("name").agg(
+        pl.col("test_id").first().alias("test_id"),
+        pl.col("testsuite").implode().alias("testsuite"),
+        pl.col("flags").explode().unique().alias("flags"),
+        failure_rate_expr.alias("failure_rate"),
+        flake_rate_expr.alias("flake_rate"),
+        pl.col("updated_at").max().alias("updated_at"),
+        (
+            (
+                pl.col("avg_duration")
+                * (pl.col("total_pass_count") + pl.col("total_fail_count"))
+            ).sum()
+            / (pl.col("total_pass_count") + pl.col("total_fail_count")).sum()
+        ).alias("avg_duration"),
+        pl.col("total_fail_count").sum().alias("total_fail_count"),
+        pl.col("total_flaky_fail_count").sum().alias("total_flaky_fail_count"),
+        pl.col("total_pass_count").sum().alias("total_pass_count"),
+        pl.col("total_skip_count").sum().alias("total_skip_count"),
+        pl.col("commits_where_fail").sum().alias("commits_where_fail"),
+        pl.col("last_duration").max().alias("last_duration"),
+    )
+
     if term:
         table = table.filter(pl.col("name").str.contains(term))
 
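This group_by/agg is the vectorized counterpart of the test-side dedup helper: counts are summed, updated_at and last_duration take the max, and the three rates are averaged with weight total_pass_count + total_fail_count. A minimal sketch of the same aggregation on toy data (subset of columns, invented values):

import polars as pl

df = pl.DataFrame(
    {
        "name": ["t", "t"],
        "failure_rate": [0.5, 0.1],
        "total_pass_count": [8, 85],
        "total_fail_count": [2, 5],
    }
)
runs = pl.col("total_fail_count") + pl.col("total_pass_count")
out = df.group_by("name").agg(
    ((pl.col("failure_rate") * runs).sum() / runs.sum()).alias("failure_rate"),
    pl.col("total_fail_count").sum().alias("total_fail_count"),
    pl.col("total_pass_count").sum().alias("total_pass_count"),
)
print(out)  # one row: failure_rate 0.14, total_fail_count 7, total_pass_count 93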
