Commit 0a44e84

chore: create `DF._to_pandas_batches()` for better type checking of `PandasBatches` (#2178)
1 parent: fbd405d

8 files changed (+45, -21 lines)

bigframes/core/blocks.py

Lines changed: 1 addition & 1 deletion
@@ -693,7 +693,7 @@ def to_pandas_batches(
         page_size: Optional[int] = None,
         max_results: Optional[int] = None,
         allow_large_results: Optional[bool] = None,
-    ) -> Iterator[pd.DataFrame]:
+    ) -> PandasBatches:
         """Download results one message at a time.

         page_size and max_results determine the size and number of batches,
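Judging only from how it is used in this commit, `PandasBatches` iterates like the old `Iterator[pd.DataFrame]` but also exposes an optional row count. A minimal sketch of that assumed interface; the `SupportsPandasBatches` protocol below is illustrative, not the actual class:

    from typing import Iterator, Optional, Protocol

    import pandas as pd


    class SupportsPandasBatches(Protocol):
        """Illustrative protocol for the surface this commit relies on."""

        # Row count of the full result when known; None when unavailable.
        total_rows: Optional[int]

        def __iter__(self) -> Iterator[pd.DataFrame]:
            """Yield one pandas DataFrame per downloaded batch."""
            ...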

bigframes/dataframe.py

Lines changed: 13 additions & 0 deletions
@@ -1930,6 +1930,19 @@ def to_pandas_batches(
             form the original dataframe. Results stream from bigquery,
             see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable
         """
+        return self._to_pandas_batches(
+            page_size=page_size,
+            max_results=max_results,
+            allow_large_results=allow_large_results,
+        )
+
+    def _to_pandas_batches(
+        self,
+        page_size: Optional[int] = None,
+        max_results: Optional[int] = None,
+        *,
+        allow_large_results: Optional[bool] = None,
+    ) -> blocks.PandasBatches:
         return self._block.to_pandas_batches(
             page_size=page_size,
             max_results=max_results,
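The richer return type lets internal callers read the row count off the batches object instead of issuing a separate `len(df)`/`df.shape` query. A sketch of the intended call pattern, assuming a BigQuery DataFrames `df` (the page size of 100 is arbitrary):

    import math

    batches = df._to_pandas_batches(page_size=100)

    # total_rows is Optional[int]; it may be unknown for some results.
    if batches.total_rows is not None:
        num_pages = math.ceil(batches.total_rows / 100)
    else:
        num_pages = None  # unknown; a UI could show "page 1 of many"

    first_page = next(iter(batches))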

bigframes/display/anywidget.py

Lines changed: 6 additions & 3 deletions
@@ -23,6 +23,7 @@
 import pandas as pd

 import bigframes
+import bigframes.dataframe
 import bigframes.display.html

 # anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -73,7 +74,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         initial_page_size = bigframes.options.display.max_rows

         # Initialize data fetching attributes.
-        self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
+        self._batches = dataframe._to_pandas_batches(page_size=initial_page_size)

         # set traitlets properties that trigger observers
         self.page_size = initial_page_size
@@ -82,7 +83,9 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         # SELECT COUNT(*) query. It is a must have however.
         # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
         # before we get here so that the count might already be cached.
-        self.row_count = len(dataframe)
+        # TODO(b/452747934): Allow row_count to be None and check to see if
+        # there are multiple pages and show "page 1 of many" in this case.
+        self.row_count = self._batches.total_rows or 0

         # get the initial page
         self._set_table_html()
@@ -180,7 +183,7 @@ def _cached_data(self) -> pd.DataFrame:

     def _reset_batches_for_new_page_size(self):
         """Reset the batch iterator when page size changes."""
-        self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
+        self._batches = self._dataframe._to_pandas_batches(page_size=self.page_size)
         self._cached_batches = []
         self._batch_iter = None
         self._all_data_loaded = False
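The `or 0` fallback collapses an unknown count to zero; TODO(b/452747934) points toward keeping the `None` instead. One possible shape for that follow-up, where `page_label` is a hypothetical helper and not part of this commit:

    import math
    from typing import Optional


    def page_label(page: int, row_count: Optional[int], page_size: int) -> str:
        """Hypothetical pager label that tolerates an unknown row count."""
        if row_count is None:
            return f"page {page + 1} of many"
        total_pages = max(1, math.ceil(row_count / page_size))
        return f"page {page + 1} of {total_pages}"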

tests/benchmark/read_gbq_colab/aggregate_output.py

Lines changed: 6 additions & 4 deletions
@@ -26,8 +26,9 @@ def aggregate_output(*, project_id, dataset_id, table_id):
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))

     # To simulate very small rows that can only fit a boolean,
     # some tables don't have an integer column. If an integer column is available,
@@ -43,8 +44,9 @@ def aggregate_output(*, project_id, dataset_id, table_id):
         .sum(numeric_only=True)
     )

-    df_aggregated.shape
-    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df_aggregated._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))


 if __name__ == "__main__":
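The repeated `assert (tr := batches.total_rows) is not None and tr >= 0` uses an assignment expression so each benchmark both forces the count to be materialized and narrows it from `Optional[int]` in one statement. It desugars to roughly:

    tr = batches.total_rows
    assert tr is not None and tr >= 0  # fail fast if the count is missing

Note that `assert` statements are skipped under `python -O`, so these benchmarks assume the default interpreter mode.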

tests/benchmark/read_gbq_colab/filter_output.py

Lines changed: 7 additions & 5 deletions
@@ -31,17 +31,19 @@ def filter_output(
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))

     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    rows, _ = df_filtered.shape
+    batches = df_filtered._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    first_page = next(iter(batches))

     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.
-    first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
-    assert len(first_page.index) <= rows
+    assert len(first_page.index) <= tr


 if __name__ == "__main__":
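As the comment above notes, the filter can drop every row; the code still assumes the iterator yields at least one (possibly empty) page, otherwise `next(iter(batches))` would raise StopIteration. If that guarantee did not hold, a defensive variant (illustrative only) would be:

    first_page = next(iter(batches), None)  # None instead of StopIteration
    if first_page is not None:
        assert len(first_page.index) <= tr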

tests/benchmark/read_gbq_colab/first_page.py

Lines changed: 3 additions & 2 deletions
@@ -28,8 +28,9 @@ def first_page(*, project_id, dataset_id, table_id):
     )

     # Get number of rows (to calculate number of pages) and the first page.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))


 if __name__ == "__main__":

tests/benchmark/read_gbq_colab/last_page.py

Lines changed: 3 additions & 2 deletions
@@ -28,8 +28,9 @@ def last_page(*, project_id, dataset_id, table_id):
     )

     # Get number of rows (to calculate number of pages) and then all pages.
-    df.shape
-    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    for _ in batches:
         pass

tests/benchmark/read_gbq_colab/sort_output.py

Lines changed: 6 additions & 4 deletions
@@ -28,17 +28,19 @@ def sort_output(*, project_id, dataset_id, table_id):
     )

     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))

     # Simulate the user sorting by a column and visualizing those results
     sort_column = "col_int64_1"
     if sort_column not in df.columns:
         sort_column = "col_bool_0"

     df_sorted = df.sort_values(sort_column)
-    df_sorted.shape
-    next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
+    batches = df_sorted._to_pandas_batches(page_size=PAGE_SIZE)
+    assert (tr := batches.total_rows) is not None and tr >= 0
+    next(iter(batches))


 if __name__ == "__main__":
