
perf: Replace expensive len() call with PandasBatches.total_rows in anywidget TableWidget #1937

Open · wants to merge 12 commits into base: main
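In short, the widget previously called `len(dataframe)`, which issues a separate `SELECT COUNT(*)` query; this PR reuses the row count that `to_pandas_batches()` already returns with its metadata. A minimal sketch of the idea, not the exact widget code (the query and page size below are illustrative; the API names match the diff that follows):

```python
import typing

import bigframes.core.blocks
import bigframes.pandas as bpd

# Illustrative query; any BigQuery DataFrame works here.
df = bpd.read_gbq("SELECT * FROM `bigquery-public-data`.samples.shakespeare")

# Before: len(df) triggers an extra SELECT COUNT(*) round trip.
# row_count = len(df)

# After: the object returned by to_pandas_batches() already carries the total
# row count, so no additional query is needed.
batches = df.to_pandas_batches(page_size=25)
# The annotation says Iterable[pd.DataFrame], but the runtime type is
# PandasBatches, which exposes total_rows; cast so mypy accepts the access.
batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
row_count = batches.total_rows or 0
```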
48 changes: 29 additions & 19 deletions bigframes/display/anywidget.py
@@ -17,12 +17,14 @@
from importlib import resources
import functools
import math
from typing import Any, Dict, Iterator, List, Optional, Type
import typing
from typing import Any, cast, Dict, Iterator, List, Optional, Type
import uuid

import pandas as pd

import bigframes
import bigframes.core.blocks
import bigframes.display.html

# anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -45,8 +47,10 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -63,28 +67,31 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
super().__init__()
self._dataframe = dataframe

# Initialize attributes that might be needed by observers FIRST
# Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []

# respect display options for initial page size
# Respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows

# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
batches = dataframe.to_pandas_batches(
page_size=initial_page_size,
)
self._batches: bigframes.core.blocks.PandasBatches = cast(
bigframes.core.blocks.PandasBatches, batches
)

# The query issued by `to_pandas_batches()` already contains metadata
# about how many results there were. Use that to avoid doing an extra
# COUNT(*) query that `len(...)` would do.
self.row_count = self._batches.total_rows or 0

# set traitlets properties that trigger observers
# Set page_size after _batches is available since traitlets observers
# may depend on _batches being initialized when the change trigger happens
self.page_size = initial_page_size

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)

# get the initial page
self._set_table_html()

@functools.cached_property
@@ -160,15 +167,17 @@ def _get_next_batch(self) -> bool:
batch = next(iterator)
self._cached_batches.append(batch)
return True
except StopIteration:
except StopIteration as e:
self._all_data_loaded = True
if not isinstance(e, StopIteration):
# If we fail to get a batch, assume no more data is available.
self.row_count = 0
return False

@property
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
"""Lazily initializes and returns the batch iterator."""
if self._batch_iter is None:
self._batch_iter = iter(self._batches)
self._batch_iter = iter(self._batches)
return self._batch_iter

@property
@@ -180,7 +189,8 @@ def _cached_data(self) -> pd.DataFrame:

def _reset_batches_for_new_page_size(self):
"""Reset the batch iterator when page size changes."""
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False
20 changes: 16 additions & 4 deletions notebooks/dataframes/anywidget_mode.ipynb
@@ -76,7 +76,7 @@
{
"data": {
"text/html": [
"Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a643d120-4af9-44fc-ba3c-ed461cf1092b&page=queryresults\">Open Job</a>"
"Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1ea2b594-2bd7-46de-a3c8-6aeee5884ba2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -139,10 +139,22 @@
"id": "ce250157",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:67e679e9-94da-47f7-8be1-8b4a496fbfbd&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
"model_id": "e74c3920b93644a0b2afdaa3841cad31",
"version_major": 2,
"version_minor": 1
},
@@ -193,7 +205,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "121e3d2f28004036a922e3a11a08d4b7",
"model_id": "b4f7a3f86ef54e07b24ef10061088391",
"version_major": 2,
"version_minor": 1
},
@@ -287,7 +299,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ed335bbbc064e5391ea06a9a218642e",
"model_id": "44a829aca2f24cfdba4b61afd1a259fe",
"version_major": 2,
"version_minor": 1
},
4 changes: 2 additions & 2 deletions notebooks/dataframes/dataframe.ipynb
@@ -5366,7 +5366,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
@@ -5380,7 +5380,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
"version": "3.10.15"
}
},
"nbformat": 4,
10 changes: 6 additions & 4 deletions tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -26,8 +27,9 @@ def aggregate_output(*, project_id, dataset_id, table_id):
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
next(iter(batches))
Collaborator:
Please add an assertion that references batches.total_rows. We want to mimic TableWidget as closely as we can.

For example:

Suggested change:
- next(iter(batches))
+ assert batches.total_rows >= 0
+ next(iter(batches))

Same for the other benchmarks.

Contributor (author):
This change will trigger a mypy error: to_pandas_batches() actually returns a PandasBatches object that has the total_rows attribute, but the type annotations show it as Iterable[pandas.DataFrame]. I can assert here after casting.
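For illustration only, a rough sketch of the cast-then-assert, assuming the `df` and `PAGE_SIZE` names already used in this benchmark (not the final committed code):

```python
import typing

import bigframes.core.blocks

batches = df.to_pandas_batches(page_size=PAGE_SIZE)
# Cast to the runtime PandasBatches type so mypy allows the total_rows access,
# then assert on it, per the suggestion above.
batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
assert batches.total_rows is not None and batches.total_rows >= 0
next(iter(batches))
```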


# To simulate very small rows that can only fit a boolean,
# some tables don't have an integer column. If an integer column is available,
@@ -43,8 +45,8 @@
.sum(numeric_only=True)
)

df_aggregated.shape
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches_aggregated))


if __name__ == "__main__":
16 changes: 11 additions & 5 deletions tests/benchmark/read_gbq_colab/filter_output.py
@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

import bigframes.core.blocks
import bigframes.pandas as bpd

PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
@@ -31,16 +33,20 @@ def filter_output(
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
rows, _ = df_filtered.shape

batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
batches_filtered = typing.cast(
bigframes.core.blocks.PandasBatches, batches_filtered
)
rows = batches_filtered.total_rows
assert rows >= 0
# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows


7 changes: 5 additions & 2 deletions tests/benchmark/read_gbq_colab/first_page.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -28,8 +29,10 @@ def first_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and the first page.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
first_page = next(iter(batches))
assert first_page is not None


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions tests/benchmark/read_gbq_colab/last_page.py
@@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and then all pages.
df.shape
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
for _ in batches:
pass


11 changes: 7 additions & 4 deletions tests/benchmark/read_gbq_colab/sort_output.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -28,17 +29,19 @@ def sort_output(*, project_id, dataset_id, table_id):
)

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
next(iter(batches))

# Simulate the user sorting by a column and visualizing those results
sort_column = "col_int64_1"
if sort_column not in df.columns:
sort_column = "col_bool_0"

df_sorted = df.sort_values(sort_column)
df_sorted.shape
next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches_sorted).total_rows >= 0
next(iter(batches_sorted))


if __name__ == "__main__":