Skip to content

Commit 0881bf6

Browse files
committed
change code and update more testcase
1 parent e0d78e0 commit 0881bf6

File tree

6 files changed

+31
-29
lines changed

6 files changed

+31
-29
lines changed

bigframes/display/anywidget.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,26 +72,27 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
7272
self._table_id = str(uuid.uuid4())
7373
self._all_data_loaded = False
7474
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
75-
self._batches: Optional[bigframes.core.blocks.PandasBatches] = None
7675
self._cached_batches: List[pd.DataFrame] = []
7776

7877
# Respect display options for initial page size
7978
initial_page_size = bigframes.options.display.max_rows
8079

81-
# Fetches initial data batches and row count for display.
8280
batches = dataframe.to_pandas_batches(
8381
page_size=initial_page_size,
8482
)
85-
self._batches = cast(bigframes.core.blocks.PandasBatches, batches)
83+
self._batches: bigframes.core.blocks.PandasBatches = cast(
84+
bigframes.core.blocks.PandasBatches, batches
85+
)
8686

87-
# Use total_rwos from batches directly
87+
# The query issued by `to_pandas_batches()` already contains metadata
88+
# about how many results there were. Use that to avoid doing an extra
89+
# COUNT(*) query that `len(...)` would do.
8890
self.row_count = self._batches.total_rows or 0
8991

9092
# Set page_size after _batches is available since traitlets observers
9193
# may depend on _batches being initialized when the change trigger happens
9294
self.page_size = initial_page_size
9395

94-
# Generates the initial HTML table content
9596
self._set_table_html()
9697

9798
@functools.cached_property
@@ -182,11 +183,7 @@ def _get_next_batch(self) -> bool:
182183
@property
183184
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
184185
"""Lazily initializes and returns the batch iterator."""
185-
if self._batch_iter is None:
186-
if self._batches is None:
187-
self._batch_iter = iter([])
188-
else:
189-
self._batch_iter = iter(self._batches)
186+
self._batch_iter = iter(self._batches)
190187
return self._batch_iter
191188

192189
@property

tests/benchmark/read_gbq_colab/aggregate_output.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def aggregate_output(*, project_id, dataset_id, table_id):
2626
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
2727

2828
# Simulate getting the first page, since we'll always do that first in the UI.
29-
df.shape
30-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
29+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
30+
next(iter(batches))
3131

3232
# To simulate very small rows that can only fit a boolean,
3333
# some tables don't have an integer column. If an integer column is available,
@@ -43,8 +43,8 @@ def aggregate_output(*, project_id, dataset_id, table_id):
4343
.sum(numeric_only=True)
4444
)
4545

46-
df_aggregated.shape
47-
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
46+
batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
47+
next(iter(batches_aggregated))
4848

4949

5050
if __name__ == "__main__":

tests/benchmark/read_gbq_colab/filter_output.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,18 @@ def filter_output(
3131
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
3232

3333
# Simulate getting the first page, since we'll always do that first in the UI.
34-
df.shape
35-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
34+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
35+
next(iter(batches))
3636

3737
# Simulate the user filtering by a column and visualizing those results
3838
df_filtered = df[df["col_bool_0"]]
39-
rows, _ = df_filtered.shape
39+
batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
4040

4141
# It's possible we don't have any pages at all, since we filtered out all
4242
# matching rows.
43-
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
43+
first_page = next(iter(batches_filtered))
44+
rows = batches_filtered.total_rows
45+
assert rows is not None
4446
assert len(first_page.index) <= rows
4547

4648

tests/benchmark/read_gbq_colab/first_page.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import pathlib
15+
import typing
1516

1617
import benchmark.utils as utils
1718

@@ -27,10 +28,12 @@ def first_page(*, project_id, dataset_id, table_id):
2728
f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
2829
)
2930

30-
# Use total_rows from batches directly and the first page
31-
execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
32-
execute_result.total_rows or 0
33-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
31+
# Get number of rows (to calculate number of pages) and the first page.
32+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
33+
first_page = next(iter(batches))
34+
assert first_page is not None
35+
total_rows = typing.cast(typing.Any, batches).total_rows
36+
assert total_rows is not None
3437

3538

3639
if __name__ == "__main__":

tests/benchmark/read_gbq_colab/last_page.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ def last_page(*, project_id, dataset_id, table_id):
2727
f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
2828
)
2929

30-
execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
31-
execute_result.total_rows or 0
32-
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
30+
# Get number of rows (to calculate number of pages) and then all pages.
31+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
32+
for _ in batches:
3333
pass
3434

3535

tests/benchmark/read_gbq_colab/sort_output.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,17 @@ def sort_output(*, project_id, dataset_id, table_id):
2828
)
2929

3030
# Simulate getting the first page, since we'll always do that first in the UI.
31-
df.shape
32-
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
31+
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
32+
next(iter(batches))
3333

3434
# Simulate the user sorting by a column and visualizing those results
3535
sort_column = "col_int64_1"
3636
if sort_column not in df.columns:
3737
sort_column = "col_bool_0"
3838

3939
df_sorted = df.sort_values(sort_column)
40-
df_sorted.shape
41-
next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
40+
batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
41+
next(iter(batches_sorted))
4242

4343

4444
if __name__ == "__main__":

0 commit comments

Comments (0)