Skip to content

Commit bb98178

Browse files
authored
chore: fix read_gbq_colab benchmark (#1872)
* chore: fix `read_gbq_colab` benchmark
* Correct the table size to match actual percentiles.
* Only do sum() on numeric columns.
* fix for filter bench
1 parent 81e4d64 commit bb98178

File tree

3 files changed

+14
-17
lines changed

3 files changed

+14
-17
lines changed

scripts/create_read_gbq_colab_benchmark_tables.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,6 @@
4242
17486432.0,
4343
1919625975.0,
4444
],
45-
"num_materialized_or_scanned_rows": [
46-
0.0,
47-
6.0,
48-
100.0,
49-
4955.0,
50-
23108.0,
51-
139504.0,
52-
616341.0,
53-
3855698.0,
54-
83725698.0,
55-
5991998082.0,
56-
],
5745
"avg_row_bytes": [
5846
0.00014346299635435792,
5947
0.005370969708923197,
@@ -524,10 +512,11 @@ def main():
524512
for i in range(num_percentiles):
525513
percentile = TABLE_STATS["percentile"][i]
526514
avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i]
527-
num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i]
515+
table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i]
528516

517+
target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
529518
target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))
530-
num_rows = max(1, int(math.ceil(num_rows_raw)))
519+
num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
531520

532521
table_name = f"percentile_{percentile:02d}"
533522
print(f"\n--- Processing Table: {table_name} ---")

tests/benchmark/read_gbq_colab/aggregate_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def aggregate_output(
4444
df_aggregated = (
4545
df.assign(rounded=df[group_column].astype("Int64").round(-9))
4646
.groupby("rounded")
47-
.sum()
47+
.sum(numeric_only=True)
4848
)
4949

5050
df_aggregated.shape

tests/benchmark/read_gbq_colab/filter_output.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import pathlib
1515

1616
import benchmark.utils as utils
17+
import pytest
1718

1819
import bigframes.session
1920

@@ -35,8 +36,15 @@ def filter_output(
3536

3637
# Simulate the user filtering by a column and visualizing those results
3738
df_filtered = df[df["col_bool_0"]]
38-
df_filtered.shape
39-
next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
39+
rows, _ = df_filtered.shape
40+
41+
# It's possible we don't have any pages at all, since we filtered out all
42+
# matching rows.
43+
if rows == 0:
44+
with pytest.raises(StopIteration):
45+
next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
46+
else:
47+
next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
4048

4149

4250
if __name__ == "__main__":

0 commit comments

Comments
 (0)