Skip to content

Commit 0881bf6

Browse files
committed
change code and update more testcase
1 parent e0d78e0 commit 0881bf6

File tree

6 files changed

+31
-29
lines changed

6 files changed

+31
-29
lines changed

bigframes/display/anywidget.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,26 +72,27 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
7272
self._table_id = str(uuid.uuid4())
7373
self._all_data_loaded = False
7474
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
75-
self._batches: Optional[bigframes.core.blocks.PandasBatches] = None
7675
self._cached_batches: List[pd.DataFrame] = []
7776

7877
# Respect display options for initial page size
7978
initial_page_size = bigframes.options.display.max_rows
8079

81-
# Fetches initial data batches and row count for display.
8280
batches = dataframe.to_pandas_batches(
8381
page_size=initial_page_size,
8482
)
85-
self._batches = cast(bigframes.core.blocks.PandasBatches, batches)
83+
self._batches: bigframes.core.blocks.PandasBatches = cast(
84+
bigframes.core.blocks.PandasBatches, batches
85+
)
8686

87-
# Use total_rwos from batches directly
87+
# The query issued by `to_pandas_batches()` already contains metadata
88+
# about how many results there were. Use that to avoid doing an extra
89+
# COUNT(*) query that `len(...)` would do.
8890
self.row_count = self._batches.total_rows or 0
8991

9092
# Set page_size after _batches is available since traitlets observers
9193
# may depend on _batches being initialized when the change trigger happens
9294
self.page_size = initial_page_size
9395

94-
# Generates the initial HTML table content
9596
self._set_table_html()
9697

9798
@functools.cached_property
@@ -182,11 +183,7 @@ def _get_next_batch(self) -> bool:
182183
@property
183184
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
184185
"""Lazily initializes and returns the batch iterator."""
185-
if self._batch_iter is None:
186-
if self._batches is None:
187-
self._batch_iter = iter([])
188-
else:
189-
self._batch_iter = iter(self._batches)
186+
self._batch_iter = iter(self._batches)
190187
return self._batch_iter
191188

192189
@property

tests/benchmark/read_gbq_colab/aggregate_output.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def aggregate_output(*, project_id, dataset_id, table_id):
2626
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
2727

2828
# Simulate getting the first page, since we'll always do that first in the UI.
29-
df.shape
30-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
29+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
30+
next(iter(batches))
3131

3232
# To simulate very small rows that can only fit a boolean,
3333
# some tables don't have an integer column. If an integer column is available,
@@ -43,8 +43,8 @@ def aggregate_output(*, project_id, dataset_id, table_id):
4343
.sum(numeric_only=True)
4444
)
4545

46-
df_aggregated.shape
47-
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
46+
batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
47+
next(iter(batches_aggregated))
4848

4949

5050
if __name__ == "__main__":

tests/benchmark/read_gbq_colab/filter_output.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,18 @@ def filter_output(
3131
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
3232

3333
# Simulate getting the first page, since we'll always do that first in the UI.
34-
df.shape
35-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
34+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
35+
next(iter(batches))
3636

3737
# Simulate the user filtering by a column and visualizing those results
3838
df_filtered = df[df["col_bool_0"]]
39-
rows, _ = df_filtered.shape
39+
batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
4040

4141
# It's possible we don't have any pages at all, since we filtered out all
4242
# matching rows.
43-
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
43+
first_page = next(iter(batches_filtered))
44+
rows = batches_filtered.total_rows
45+
assert rows is not None
4446
assert len(first_page.index) <= rows
4547

4648

tests/benchmark/read_gbq_colab/first_page.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import pathlib
15+
import typing
1516

1617
import benchmark.utils as utils
1718

@@ -27,10 +28,12 @@ def first_page(*, project_id, dataset_id, table_id):
2728
f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
2829
)
2930

30-
# Use total_rows from batches directly and the first page
31-
execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
32-
execute_result.total_rows or 0
33-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
31+
# Get number of rows (to calculate number of pages) and the first page.
32+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
33+
first_page = next(iter(batches))
34+
assert first_page is not None
35+
total_rows = typing.cast(typing.Any, batches).total_rows
36+
assert total_rows is not None
3437

3538

3639
if __name__ == "__main__":

tests/benchmark/read_gbq_colab/last_page.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ def last_page(*, project_id, dataset_id, table_id):
2727
f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
2828
)
2929

30-
execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
31-
execute_result.total_rows or 0
32-
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
30+
# Get number of rows (to calculate number of pages) and then all pages.
31+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
32+
for _ in batches:
3333
pass
3434

3535

tests/benchmark/read_gbq_colab/sort_output.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,17 @@ def sort_output(*, project_id, dataset_id, table_id):
2828
)
2929

3030
# Simulate getting the first page, since we'll always do that first in the UI.
31-
df.shape
32-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
31+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
32+
next(iter(batches))
3333

3434
# Simulate the user sorting by a column and visualizing those results
3535
sort_column = "col_int64_1"
3636
if sort_column not in df.columns:
3737
sort_column = "col_bool_0"
3838

3939
df_sorted = df.sort_values(sort_column)
40-
df_sorted.shape
41-
next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
40+
batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
41+
next(iter(batches_sorted))
4242

4343

4444
if __name__ == "__main__":

0 commit comments

Comments (0)