chore: add benchmarks for read_gbq_colab (#1860)

tswast · web-flow · commit ed75cd99ff37 · 2025-06-27T14:24:46.000-07:00
* chore: add benchmarks for read_gbq_colab

* correct project id

* exclude *.error files too

* Delete tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error

* explain column selection for groupby
diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore
@@ -0,0 +1,6 @@
+*.bytesprocessed
+*.bq_exec_time_seconds
+*.error
+*.local_exec_time_seconds
+*.query_char_count
+*.slotmillis
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -0,0 +1,72 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def aggregate_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: load the table, fetch the first page, then group/sum and fetch again.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape  # Value is discarded; mimics the UI asking for the table's dimensions.
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # Consume only the first batch.
+
+    # To simulate very small rows that can only fit a boolean,
+    # some tables don't have an integer column. If an integer column is available,
+    # we prefer to group by that to get a more realistic number of groups.
+    group_column = "col_int64_1"
+    if group_column not in df.columns:
+        group_column = "col_bool_0"  # Fallback when the table has no col_int64_1.
+
+    # Simulate the user aggregating by a column and visualizing those results
+    df_aggregated = (
+        df.assign(rounded=df[group_column].astype("Int64").round(-9))  # NOTE(review): presumably buckets to the nearest 1e9 as in pandas; confirm.
+        .groupby("rounded")
+        .sum()  # Sums every remaining column per "rounded" group.
+    )
+
+    df_aggregated.shape  # Same shape-then-first-page sequence, now on the aggregated result.
+    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # Script entry point for the aggregate_output benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        aggregate_output,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror aggregate_output's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl
@@ -0,0 +1,10 @@
+{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
+{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
+{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
+{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
+{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
+{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
+{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
+{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
+{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
+{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py
@@ -0,0 +1,48 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+
+def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    session._read_gbq_colab(  # Benchmark body: a single call whose return value is discarded.
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}",
+        dry_run=True,  # Exercises the dry-run path of _read_gbq_colab.
+    )
+
+
+if __name__ == "__main__":  # Script entry point for the dry_run benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        dry_run,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror dry_run's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -0,0 +1,60 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def filter_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: load the table, fetch the first page, then filter and fetch again.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape  # Value is discarded; mimics the UI asking for the table's dimensions.
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # Consume only the first batch.
+
+    # Simulate the user filtering by a column and visualizing those results
+    df_filtered = df[df["col_bool_0"]]  # Uses col_bool_0 as the row mask (assumes it is a boolean column).
+    df_filtered.shape  # Same shape-then-first-page sequence, now on the filtered result.
+    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # Script entry point for the filter_output benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        filter_output,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror filter_output's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
@@ -0,0 +1,53 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(  # Benchmark body starts here: load the configured table.
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and the first page.
+    df.shape  # Value is discarded; only the access matters for the benchmark.
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # Consume only the first batch.
+
+
+if __name__ == "__main__":  # Script entry point for the first_page benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        first_page,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror first_page's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
@@ -0,0 +1,54 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(  # Benchmark body starts here: load the configured table.
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and then all pages.
+    df.shape  # Value is discarded; only the access matters for the benchmark.
+    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):  # Drain the iterator through the final batch.
+        pass  # Batch contents are intentionally ignored.
+
+
+if __name__ == "__main__":  # Script entry point for the last_page benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        last_page,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror last_page's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def sort_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: load the table, fetch the first page, then sort and fetch again.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape  # Value is discarded; mimics the UI asking for the table's dimensions.
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # Consume only the first batch.
+
+    # Simulate the user sorting by a column and visualizing those results
+    sort_column = "col_int64_1"
+    if sort_column not in df.columns:
+        sort_column = "col_bool_0"  # Fallback when the table has no col_int64_1.
+
+    df_sorted = df.sort_values(sort_column)
+    df_sorted.shape  # Same shape-then-first-page sequence, now on the sorted result.
+    next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # Script entry point for the sort_output benchmark.
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # Five values are unpacked, table_id included.
+    current_path = pathlib.Path(__file__).absolute()  # Absolute path of this script.
+
+    utils.get_execution_time(  # NOTE(review): presumably uses current_path/suffix to label results; confirm in utils.
+        sort_output,
+        current_path,
+        suffix,
+        project_id=project_id,  # Keyword arguments mirror sort_output's keyword-only parameters.
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py
@@ -17,6 +17,8 @@
 
 import bigframes
 
+READ_GBQ_COLAB_PAGE_SIZE = 100  # page_size the read_gbq_colab benchmarks pass to to_pandas_batches.
+
 
 def get_configuration(include_table_id=False):
     parser = argparse.ArgumentParser()
@@ -94,6 +96,7 @@ def _str_to_bool(value):
 
 
 def _initialize_session(ordered: bool):
+    # TODO(tswast): add a flag to enable the polars semi-executor.
     context = bigframes.BigQueryOptions(
         location="US", ordering_mode="strict" if ordered else "partial"
     )