diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 5a20ddcb7f..1ca57e89ef 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -23,6 +23,7 @@
import pandas as pd
import bigframes
+import bigframes.core.blocks
import bigframes.display.html
# anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -45,8 +46,10 @@
class TableWidget(WIDGET_BASE):
- """
- An interactive, paginated table widget for BigFrames DataFrames.
+ """An interactive, paginated table widget for BigFrames DataFrames.
+
+ This widget provides a user-friendly way to display and navigate through
+ large BigQuery DataFrames within a Jupyter environment.
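+
+    Example (an illustrative sketch; assumes an interactive notebook session
+    with the ``anywidget`` extra installed and access to BigQuery public
+    data)::
+
+        import bigframes.pandas as bpd
+
+        bpd.options.display.repr_mode = "anywidget"
+        df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")
+        df  # Rendered as an interactive, paginated TableWidget.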
"""
def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -63,28 +66,34 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
super().__init__()
self._dataframe = dataframe
- # Initialize attributes that might be needed by observers FIRST
+ # Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []
- # respect display options for initial page size
+ # Respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows
- # Initialize data fetching attributes.
- self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
+ execute_result = dataframe._block.session._executor.execute(
+ dataframe._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+
+        # The query issued by the executor already reports how many rows the
+        # result contains. Use that metadata to avoid the extra COUNT(*)
+        # query that `len(dataframe)` would otherwise trigger.
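+        # `total_rows` may be None if the executor cannot report a count for
+        # the result, so fall back to 0 to keep the widget renderable.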
+ self.row_count = execute_result.total_rows or 0
- # set traitlets properties that trigger observers
- self.page_size = initial_page_size
+ # Create pandas batches from the ExecuteResult
+ self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)
- # len(dataframe) is expensive, since it will trigger a
- # SELECT COUNT(*) query. It is a must have however.
- # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
- # before we get here so that the count might already be cached.
- self.row_count = len(dataframe)
+        # Set page_size only after _batches is available, writing the
+        # underlying traitlet value directly so observers do not fire
+        # before initialization is complete.
+        self._trait_values["page_size"] = initial_page_size
+        self._trait_notifiers["page_size"] = {}  # Ensure a notifier entry exists.
- # get the initial page
self._set_table_html()
@functools.cached_property
@@ -180,7 +188,16 @@ def _cached_data(self) -> pd.DataFrame:
def _reset_batches_for_new_page_size(self):
"""Reset the batch iterator when page size changes."""
- self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
+ # Execute with explicit destination for consistency with __init__
+ execute_result = self._dataframe._block.session._executor.execute(
+ self._dataframe._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+
+ # Create pandas batches from the ExecuteResult
+ self._batches = execute_result.to_pandas_batches(page_size=self.page_size)
+
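+        # Clear cached batches and iterator state so pagination restarts
+        # from the first page at the new page size.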
self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False
diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
index 617329ba65..34d9fae12b 100644
--- a/notebooks/dataframes/anywidget_mode.ipynb
+++ b/notebooks/dataframes/anywidget_mode.ipynb
@@ -73,18 +73,6 @@
"id": "f289d250",
"metadata": {},
"outputs": [
- {
- "data": {
- "text/html": [
- "Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
{
"name": "stdout",
"output_type": "stream",
@@ -139,15 +127,27 @@
"id": "ce250157",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
+ "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
- "TableWidget(page_size=10, row_count=5552452, table_html='= 0
+ batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+ next(iter(batches))
# To simulate very small rows that can only fit a boolean,
# some tables don't have an integer column. If an integer column is available,
@@ -42,9 +48,19 @@ def aggregate_output(*, project_id, dataset_id, table_id):
.groupby("rounded")
.sum(numeric_only=True)
)
-
- df_aggregated.shape
- next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
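+    # Execute the aggregation through the session executor so the row count
+    # comes back as result metadata instead of a separate shape call.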
+ execute_result_aggregated = df_aggregated._block.session._executor.execute(
+ df_aggregated._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+ assert (
+ execute_result_aggregated.total_rows is not None
+ and execute_result_aggregated.total_rows >= 0
+ )
+ batches_aggregated = execute_result_aggregated.to_pandas_batches(
+ page_size=PAGE_SIZE
+ )
+ next(iter(batches_aggregated))
if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index b3c9181770..b1dfdf3424 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -31,16 +31,32 @@ def filter_output(
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
# Simulate getting the first page, since we'll always do that first in the UI.
- df.shape
- next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+ # Force BigQuery execution to get total_rows metadata
+ execute_result = df._block.session._executor.execute(
+ df._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+ batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+ next(iter(batches))
# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
- rows, _ = df_filtered.shape
+ # Force BigQuery execution for filtered DataFrame to get total_rows metadata
+ execute_result_filtered = df_filtered._block.session._executor.execute(
+ df_filtered._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+
+ rows = execute_result_filtered.total_rows or 0
+ assert rows >= 0
+
+ batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)
# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
- first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+ first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
index 7f8cdb0d51..90bd4024cb 100644
--- a/tests/benchmark/read_gbq_colab/first_page.py
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -28,8 +28,15 @@ def first_page(*, project_id, dataset_id, table_id):
)
# Get number of rows (to calculate number of pages) and the first page.
- df.shape
- next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
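+    # Execute through the session executor so `total_rows` arrives as result
+    # metadata, replacing the COUNT(*) query that `df.shape` used to issue.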
+ execute_result = df._block.session._executor.execute(
+ df._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+ assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+ batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+ first_page = next(iter(batches))
+ assert first_page is not None
if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
index 7786e2f8bd..e00b304900 100644
--- a/tests/benchmark/read_gbq_colab/last_page.py
+++ b/tests/benchmark/read_gbq_colab/last_page.py
@@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
)
-    # Get number of rows (to calculate number of pages) and then all pages.
-    df.shape
-    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+    # Iterate through all pages; the row count is no longer fetched here.
+    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    for _ in batches:
pass
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
index 7933c4472e..9724373dde 100644
--- a/tests/benchmark/read_gbq_colab/sort_output.py
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -28,8 +28,14 @@ def sort_output(*, project_id, dataset_id, table_id):
)
# Simulate getting the first page, since we'll always do that first in the UI.
- df.shape
- next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
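+    # The executor returns row-count metadata along with the first page.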
+ execute_result = df._block.session._executor.execute(
+ df._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+ assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+ batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+ next(iter(batches))
# Simulate the user sorting by a column and visualizing those results
sort_column = "col_int64_1"
@@ -37,8 +43,17 @@ def sort_output(*, project_id, dataset_id, table_id):
sort_column = "col_bool_0"
df_sorted = df.sort_values(sort_column)
- df_sorted.shape
- next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
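+    # Repeat the executor pattern for the sorted frame.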
+ execute_result_sorted = df_sorted._block.session._executor.execute(
+ df_sorted._block.expr,
+ ordered=True,
+ use_explicit_destination=True,
+ )
+ assert (
+ execute_result_sorted.total_rows is not None
+ and execute_result_sorted.total_rows >= 0
+ )
+ batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE)
+ next(iter(batches_sorted))
if __name__ == "__main__":
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 8a91176dd9..2103c52dbb 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -61,11 +61,12 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame):
Helper fixture to create a TableWidget instance with a fixed page size.
This reduces duplication across tests that use the same widget configuration.
"""
- from bigframes import display
+
+ from bigframes.display import TableWidget
with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
# Delay context manager cleanup of `max_rows` until after tests finish.
- yield display.TableWidget(paginated_bf_df)
+ yield TableWidget(paginated_bf_df)
@pytest.fixture(scope="module")
@@ -90,10 +91,10 @@ def small_bf_df(
@pytest.fixture
def small_widget(small_bf_df):
"""Helper fixture for tests using a DataFrame smaller than the page size."""
- from bigframes import display
+ from bigframes.display import TableWidget
with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5):
- yield display.TableWidget(small_bf_df)
+ yield TableWidget(small_bf_df)
@pytest.fixture(scope="module")
@@ -109,6 +110,23 @@ def empty_bf_df(
return session.read_pandas(empty_pandas_df)
+def mock_execute_result_with_params(
+ self, schema, total_rows_val, arrow_batches_val, *args, **kwargs
+):
+ """
+ Mocks an execution result with configurable total_rows and arrow_batches.
+ """
+ from bigframes.session.executor import ExecuteResult
+
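+    # Only `schema` and `total_rows` matter for the behavior under test;
+    # job and byte-count metadata are deliberately left unset.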
+ return ExecuteResult(
+ iter(arrow_batches_val),
+ schema=schema,
+ query_job=None,
+ total_bytes=None,
+ total_rows=total_rows_val,
+ )
+
+
def _assert_html_matches_pandas_slice(
table_html: str,
expected_pd_slice: pd.DataFrame,
@@ -135,10 +153,10 @@ def test_widget_initialization_should_calculate_total_row_count(
paginated_bf_df: bf.dataframe.DataFrame,
):
"""A TableWidget should correctly calculate the total row count on creation."""
- from bigframes import display
+ from bigframes.display import TableWidget
with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
- widget = display.TableWidget(paginated_bf_df)
+ widget = TableWidget(paginated_bf_df)
assert widget.row_count == EXPECTED_ROW_COUNT
@@ -436,6 +454,82 @@ def test_widget_creation_should_load_css_for_rendering(table_widget):
assert ".bigframes-widget .footer" in css_content
+def test_widget_row_count_should_be_immutable_after_creation(
+ paginated_bf_df: bf.dataframe.DataFrame,
+):
+ """
+    Given a widget created with a specific configuration, when global display
+ options are changed later, the widget's original row_count should remain
+ unchanged.
+ """
+ from bigframes.display import TableWidget
+
+ # Use a context manager to ensure the option is reset
+ with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+ widget = TableWidget(paginated_bf_df)
+ initial_row_count = widget.row_count
+
+ # Change a global option that could influence row count
+ bf.options.display.max_rows = 10
+
+ # Verify the row count remains immutable.
+ assert widget.row_count == initial_row_count
+
+
+class FaultyIterator:
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ raise ValueError("Simulated read error")
+
+
+def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows(
+ paginated_bf_df: bf.dataframe.DataFrame,
+ monkeypatch: pytest.MonkeyPatch,
+):
+    Given an internal component that fails to return valid execution data,
+ Given an internal component fails to return valid execution data,
+ when the TableWidget is created, its row_count should safely fall back to 0.
+ """
+ # Patch the executor's 'execute' method to simulate an error.
+ monkeypatch.setattr(
+ "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute",
+ lambda self, *args, **kwargs: mock_execute_result_with_params(
+ self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs
+ ),
+ )
+
+ # Create the TableWidget under the error condition.
+ with bf.option_context("display.repr_mode", "anywidget"):
+ from bigframes.display import TableWidget
+
+ # The widget should handle the faulty data from the mock without crashing.
+ widget = TableWidget(paginated_bf_df)
+
+ # The widget safely defaults to 0 rows.
+ assert widget.row_count == 0
+
+
+def test_widget_row_count_reflects_actual_data_available(
+ paginated_bf_df: bf.dataframe.DataFrame,
+):
+ """
+    Test that the widget's row_count reflects the total rows actually
+    available in the DataFrame, regardless of the page size used for display.
+ """
+ from bigframes.display import TableWidget
+
+ # Set up display options that define a page size.
+ with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+ widget = TableWidget(paginated_bf_df)
+
+ # The widget should report the total rows in the DataFrame,
+ # not limited by page_size (which only affects pagination)
+ assert widget.row_count == EXPECTED_ROW_COUNT
+ assert widget.page_size == 2 # Respects the display option
+
+
# TODO(shuowei): Add tests for custom index and multiindex
# This may not be necessary for the SQL Cell use case but should be
# considered for completeness.