Skip to content

Commit ed75cd9

Browse files
authored
chore: add benchmarks for read_gbq_colab (#1860)
* chore: add benchmarks for read_gbq_colab
* correct project id
* exclude error too
* Delete tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error
* explain column selection for groupby
1 parent c5d251a commit ed75cd9

File tree

9 files changed

+370
-0
lines changed

9 files changed

+370
-0
lines changed

tests/benchmark/.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
*.bytesprocessed
2+
*.bq_exec_time_seconds
3+
*.error
4+
*.local_exec_time_seconds
5+
*.query_char_count
6+
*.slotmillis
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
21+
22+
23+
def aggregate_output(
    *, project_id, dataset_id, table_id, session: bigframes.session.Session
):
    """Benchmark reading a table, paging it, then aggregating and paging again.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    df = session._read_gbq_colab(
        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    )

    # Simulate getting the first page, since we'll always do that first in the UI.
    df.shape
    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))

    # To simulate very small rows that can only fit a boolean,
    # some tables don't have an integer column. If an integer column is available,
    # we prefer to group by that to get a more realistic number of groups.
    group_column = "col_int64_1"
    if group_column not in df.columns:
        group_column = "col_bool_0"

    # Simulate the user aggregating by a column and visualizing those results.
    #
    # The frame comes from ``SELECT *``, so it carries every column of the
    # table. ``numeric_only=True`` restricts the sum to columns that can be
    # summed instead of failing the whole benchmark on the others.
    # NOTE(review): assumes some benchmark tables contain non-numeric
    # columns -- confirm against the table schemas.
    df_aggregated = (
        df.assign(rounded=df[group_column].astype("Int64").round(-9))
        .groupby("rounded")
        .sum(numeric_only=True)
    )

    df_aggregated.shape
    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
52+
53+
54+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(
        aggregate_output, current_path, suffix, **benchmark_kwargs
    )
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
2+
{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
3+
{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
4+
{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
5+
{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
6+
{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
7+
{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
8+
{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
9+
{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
10+
{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
21+
def dry_run(
    *,
    project_id,
    dataset_id,
    table_id,
    session: bigframes.session.Session,
):
    """Benchmark a dry run of ``_read_gbq_colab`` over a whole table.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method is called with
            ``dry_run=True``.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    session._read_gbq_colab(query, dry_run=True)
28+
29+
30+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(dry_run, current_path, suffix, **benchmark_kwargs)
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
21+
22+
23+
def filter_output(
    *, project_id, dataset_id, table_id, session: bigframes.session.Session
):
    """Benchmark reading a table, paging it, then filtering and paging again.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    df = session._read_gbq_colab(
        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    )

    # Simulate getting the first page, since we'll always do that first in the UI.
    df.shape
    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))

    # Simulate the user filtering by a column and visualizing those results
    df_filtered = df[df["col_bool_0"]]
    df_filtered.shape

    # The filter may match zero rows. Supplying a default keeps an empty
    # result from surfacing as StopIteration if no batch is produced.
    # NOTE(review): whether to_pandas_batches yields an empty batch for an
    # empty result is not visible here -- the default makes either case safe.
    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)), None)
40+
41+
42+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(
        filter_output, current_path, suffix, **benchmark_kwargs
    )
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
21+
22+
23+
def first_page(
    *,
    project_id,
    dataset_id,
    table_id,
    session: bigframes.session.Session,
):
    """Benchmark reading a table and fetching only its first page.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # Get number of rows (to calculate number of pages) and the first page.
    df.shape
    pages = iter(df.to_pandas_batches(page_size=PAGE_SIZE))
    next(pages)
33+
34+
35+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(first_page, current_path, suffix, **benchmark_kwargs)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
21+
22+
23+
def last_page(
    *,
    project_id,
    dataset_id,
    table_id,
    session: bigframes.session.Session,
):
    """Benchmark reading a table and iterating through every page of it.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # Get number of rows (to calculate number of pages) and then all pages.
    df.shape
    pages = df.to_pandas_batches(page_size=PAGE_SIZE)
    for _page in pages:
        # Each page is fetched and discarded; only the iteration matters.
        continue
34+
35+
36+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(last_page, current_path, suffix, **benchmark_kwargs)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
18+
import bigframes.session
19+
20+
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
21+
22+
23+
def sort_output(
    *,
    project_id,
    dataset_id,
    table_id,
    session: bigframes.session.Session,
):
    """Benchmark reading a table, paging it, then sorting and paging again.

    Args:
        project_id: Project that owns the table being queried.
        dataset_id: Dataset that contains the table.
        table_id: Table selected with ``SELECT *``.
        session: Session whose ``_read_gbq_colab`` method runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # Simulate getting the first page, since we'll always do that first in the UI.
    df.shape
    unsorted_pages = iter(df.to_pandas_batches(page_size=PAGE_SIZE))
    next(unsorted_pages)

    # Simulate the user sorting by a column and visualizing those results.
    # Sort by the integer column when the table has one, else the boolean one.
    sort_column = "col_int64_1" if "col_int64_1" in df.columns else "col_bool_0"

    df_sorted = df.sort_values(sort_column)
    df_sorted.shape
    sorted_pages = iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE))
    next(sorted_pages)
44+
45+
46+
if __name__ == "__main__":
    # utils.get_configuration returns five values when include_table_id=True;
    # unpack them in the order the helper provides them.
    configuration = utils.get_configuration(include_table_id=True)
    project_id, dataset_id, table_id, session, suffix = configuration

    # Absolute path of this script, passed to the timing helper.
    current_path = pathlib.Path(__file__).absolute()

    benchmark_kwargs = {
        "project_id": project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
        "session": session,
    }
    utils.get_execution_time(sort_output, current_path, suffix, **benchmark_kwargs)

tests/benchmark/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import bigframes
1919

20+
READ_GBQ_COLAB_PAGE_SIZE = 100
21+
2022

2123
def get_configuration(include_table_id=False):
2224
parser = argparse.ArgumentParser()
@@ -94,6 +96,7 @@ def _str_to_bool(value):
9496

9597

9698
def _initialize_session(ordered: bool):
99+
# TODO(tswast): add a flag to enable the polars semi-executor.
97100
context = bigframes.BigQueryOptions(
98101
location="US", ordering_mode="strict" if ordered else "partial"
99102
)

0 commit comments

Comments
 (0)