chore: fix read_gbq_colab benchmark (#1872)

tswast · web-flow · commit bb981783d512 · 2025-06-30T17:02:28.000-05:00
* chore: fix `read_gbq_colab` benchmark

* Correct the table size to match actual percentiles.
* Only do sum() on numeric columns.

* Fix the filter benchmark so it handles the case where the filter leaves zero rows (and therefore no pages to fetch).
diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py
@@ -42,18 +42,6 @@
         17486432.0,
         1919625975.0,
     ],
-    "num_materialized_or_scanned_rows": [
-        0.0,
-        6.0,
-        100.0,
-        4955.0,
-        23108.0,
-        139504.0,
-        616341.0,
-        3855698.0,
-        83725698.0,
-        5991998082.0,
-    ],
     "avg_row_bytes": [
         0.00014346299635435792,
         0.005370969708923197,
@@ -524,10 +512,11 @@ def main():
         for i in range(num_percentiles):
             percentile = TABLE_STATS["percentile"][i]
             avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i]
-            num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i]
+            table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i]
 
+            target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
             target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))
-            num_rows = max(1, int(math.ceil(num_rows_raw)))
+            num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
 
             table_name = f"percentile_{percentile:02d}"
             print(f"\n--- Processing Table: {table_name} ---")
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -44,7 +44,7 @@ def aggregate_output(
     df_aggregated = (
         df.assign(rounded=df[group_column].astype("Int64").round(-9))
         .groupby("rounded")
-        .sum()
+        .sum(numeric_only=True)
     )
 
     df_aggregated.shape
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -14,6 +14,7 @@
 import pathlib
 
 import benchmark.utils as utils
+import pytest
 
 import bigframes.session
 
@@ -35,8 +36,15 @@ def filter_output(
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    df_filtered.shape
-    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    rows, _ = df_filtered.shape
+
+    # It's possible we don't have any pages at all, because the filter may
+    # have excluded every row (i.e. no rows matched).
+    if rows == 0:
+        with pytest.raises(StopIteration):
+            next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    else:
+        next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
 
 
 if __name__ == "__main__":

Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ def aggregate_output(`
`44`	`44`	`df_aggregated = (`
`45`	`45`	`df.assign(rounded=df[group_column].astype("Int64").round(-9))`
`46`	`46`	`.groupby("rounded")`
`47`		`- .sum()`
	`47`	`+ .sum(numeric_only=True)`
`48`	`48`	`)`
`49`	`49`
`50`	`50`	`df_aggregated.shape`