diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 5a20ddcb7f..1ca57e89ef 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -23,6 +23,7 @@
 import pandas as pd
 
 import bigframes
+import bigframes.core.blocks
 import bigframes.display.html
 
 # anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -45,8 +46,10 @@ class TableWidget(WIDGET_BASE):
-    """
-    An interactive, paginated table widget for BigFrames DataFrames.
+    """An interactive, paginated table widget for BigFrames DataFrames.
+
+    This widget provides a user-friendly way to display and navigate through
+    large BigQuery DataFrames within a Jupyter environment.
     """
 
     def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -63,28 +66,34 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         super().__init__()
         self._dataframe = dataframe
 
-        # Initialize attributes that might be needed by observers FIRST
+        # Initialize attributes that might be needed by observers first.
         self._table_id = str(uuid.uuid4())
         self._all_data_loaded = False
         self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
         self._cached_batches: List[pd.DataFrame] = []
 
-        # respect display options for initial page size
+        # Respect display options for the initial page size.
         initial_page_size = bigframes.options.display.max_rows
 
-        # Initialize data fetching attributes.
-        self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
+        execute_result = dataframe._block.session._executor.execute(
+            dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
+        )
+
+        # The executed query already reports how many rows the result
+        # contains. Use that metadata to avoid the extra COUNT(*) query
+        # that `len(...)` would issue.
+        self.row_count = execute_result.total_rows or 0
 
-        # set traitlets properties that trigger observers
-        self.page_size = initial_page_size
+        # Create pandas batches from the ExecuteResult.
+        self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)
 
-        # len(dataframe) is expensive, since it will trigger a
-        # SELECT COUNT(*) query. It is a must have however.
-        # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
-        # before we get here so that the count might already be cached.
-        self.row_count = len(dataframe)
+        # Set page_size after _batches is available, but avoid triggering
+        # observers by setting the underlying traitlet value directly.
+        self._trait_values["page_size"] = initial_page_size
+        self._trait_notifiers["page_size"] = {}  # Ensure a notifier entry exists.
 
-        # get the initial page
         self._set_table_html()
 
     @functools.cached_property
@@ -180,7 +188,16 @@ def _cached_data(self) -> pd.DataFrame:
 
     def _reset_batches_for_new_page_size(self):
         """Reset the batch iterator when page size changes."""
-        self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
+        # Execute with an explicit destination, for consistency with __init__.
+        execute_result = self._dataframe._block.session._executor.execute(
+            self._dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
+        )
+
+        # Create pandas batches from the ExecuteResult.
+        self._batches = execute_result.to_pandas_batches(page_size=self.page_size)
+
         self._cached_batches = []
         self._batch_iter = None
         self._all_data_loaded = False
diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
index 617329ba65..34d9fae12b 100644
--- a/notebooks/dataframes/anywidget_mode.ipynb
+++ b/notebooks/dataframes/anywidget_mode.ipynb
@@ -73,18 +73,6 @@
    "id": "f289d250",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -139,15 +127,27 @@
    "id": "ce250157",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
+       "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
-       "TableWidget(page_size=10, row_count=5552452, table_html='…
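The core pattern in this change, shown standalone: one ordered execution with an explicit destination returns an `ExecuteResult` whose `total_rows` metadata replaces the `SELECT COUNT(*)` that `len(df)` would trigger. A minimal sketch, assuming a configured BigQuery DataFrames session; the public table name and `PAGE_SIZE` are illustrative, and `_block`/`_executor` are the same internal APIs the diff uses:

```python
import bigframes.pandas as bpd

PAGE_SIZE = 10

df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

# One ordered query, materialized to an explicit destination table. Its
# metadata includes the total row count, so no COUNT(*) round trip is needed.
execute_result = df._block.session._executor.execute(
    df._block.expr,
    ordered=True,
    use_explicit_destination=True,
)

row_count = execute_result.total_rows or 0  # metadata, not a second query
first_page = next(iter(execute_result.to_pandas_batches(page_size=PAGE_SIZE)))
print(f"{row_count} rows total; first page has {len(first_page.index)} rows")
```

The benchmark changes below apply this same pattern to each `read_gbq_colab` scenario.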
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ … @@ def aggregate_output(*, project_id, dataset_id, table_id):
     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+    next(iter(batches))
 
     # To simulate very small rows that can only fit a boolean,
     # some tables don't have an integer column. If an integer column is available,
@@ -42,9 +48,19 @@ def aggregate_output(*, project_id, dataset_id, table_id):
         .groupby("rounded")
         .sum(numeric_only=True)
     )
-
-    df_aggregated.shape
-    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
+    execute_result_aggregated = df_aggregated._block.session._executor.execute(
+        df_aggregated._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_aggregated.total_rows is not None
+        and execute_result_aggregated.total_rows >= 0
+    )
+    batches_aggregated = execute_result_aggregated.to_pandas_batches(
+        page_size=PAGE_SIZE
+    )
+    next(iter(batches_aggregated))
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index b3c9181770..b1dfdf3424 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -31,16 +31,32 @@ def filter_output(
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    # Force BigQuery execution to get total_rows metadata.
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+    next(iter(batches))
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    rows, _ = df_filtered.shape
+    # Force BigQuery execution for the filtered DataFrame to get total_rows metadata.
+    execute_result_filtered = df_filtered._block.session._executor.execute(
+        df_filtered._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+
+    rows = execute_result_filtered.total_rows
+    assert rows is not None and rows >= 0
+
+    batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)
 
     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.
-    first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    first_page = next(iter(batches_filtered))
     assert len(first_page.index) <= rows
 
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
index 7f8cdb0d51..90bd4024cb 100644
--- a/tests/benchmark/read_gbq_colab/first_page.py
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -28,8 +28,15 @@ def first_page(*, project_id, dataset_id, table_id):
     )
 
     # Get number of rows (to calculate number of pages) and the first page.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+    first_page = next(iter(batches))
+    assert first_page is not None
 
 
 if __name__ == "__main__":
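The benchmarks fetch `total_rows` "to calculate number of pages". That calculation is a ceiling division; a small sketch (the `page_count` helper is illustrative, not part of this change):

```python
import math


def page_count(total_rows: int, page_size: int) -> int:
    """Number of pages needed to show total_rows rows, page_size rows at a time."""
    return math.ceil(total_rows / page_size)


# Row count from the notebook output above, with the widget's page size of 10.
assert page_count(5_552_452, 10) == 555_246
assert page_count(20, 10) == 2  # exact multiple: no partial page
assert page_count(0, 10) == 0  # empty result: no pages
```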
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
index 7786e2f8bd..e00b304900 100644
--- a/tests/benchmark/read_gbq_colab/last_page.py
+++ b/tests/benchmark/read_gbq_colab/last_page.py
@@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
     )
 
     # Get number of rows (to calculate number of pages) and then all pages.
-    df.shape
-    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    for _ in batches:
         pass
 
 
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
index 7933c4472e..9724373dde 100644
--- a/tests/benchmark/read_gbq_colab/sort_output.py
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -28,8 +28,14 @@ def sort_output(*, project_id, dataset_id, table_id):
     )
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    df.shape
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
+    next(iter(batches))
 
     # Simulate the user sorting by a column and visualizing those results
     sort_column = "col_int64_1"
@@ -37,8 +43,17 @@ def sort_output(*, project_id, dataset_id, table_id):
         sort_column = "col_bool_0"
 
     df_sorted = df.sort_values(sort_column)
-    df_sorted.shape
-    next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
+    execute_result_sorted = df_sorted._block.session._executor.execute(
+        df_sorted._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_sorted.total_rows is not None
+        and execute_result_sorted.total_rows >= 0
+    )
+    batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE)
+    next(iter(batches_sorted))
 
 
 if __name__ == "__main__":
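Each benchmark above repeats the same execute-then-batch boilerplate. A hypothetical helper could consolidate it; `execute_with_count` is illustrative and not part of this PR:

```python
from typing import Iterator, Tuple

import pandas as pd


def execute_with_count(df, page_size: int) -> Tuple[int, Iterator[pd.DataFrame]]:
    """Run one ordered query and return (total row count, pandas batch iterator)."""
    execute_result = df._block.session._executor.execute(
        df._block.expr,
        ordered=True,
        use_explicit_destination=True,
    )
    total_rows = execute_result.total_rows or 0
    return total_rows, execute_result.to_pandas_batches(page_size=page_size)


# Usage mirroring the benchmarks: the count and the first page from one query.
# rows, batches = execute_with_count(df, page_size=PAGE_SIZE)
# first_page = next(iter(batches))
```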
+ """ + from bigframes.session.executor import ExecuteResult + + return ExecuteResult( + iter(arrow_batches_val), + schema=schema, + query_job=None, + total_bytes=None, + total_rows=total_rows_val, + ) + + def _assert_html_matches_pandas_slice( table_html: str, expected_pd_slice: pd.DataFrame, @@ -135,10 +153,10 @@ def test_widget_initialization_should_calculate_total_row_count( paginated_bf_df: bf.dataframe.DataFrame, ): """A TableWidget should correctly calculate the total row count on creation.""" - from bigframes import display + from bigframes.display import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = display.TableWidget(paginated_bf_df) + widget = TableWidget(paginated_bf_df) assert widget.row_count == EXPECTED_ROW_COUNT @@ -436,6 +454,82 @@ def test_widget_creation_should_load_css_for_rendering(table_widget): assert ".bigframes-widget .footer" in css_content +def test_widget_row_count_should_be_immutable_after_creation( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Given a widget created with a specific configuration when global display + options are changed later, the widget's original row_count should remain + unchanged. + """ + from bigframes.display import TableWidget + + # Use a context manager to ensure the option is reset + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + widget = TableWidget(paginated_bf_df) + initial_row_count = widget.row_count + + # Change a global option that could influence row count + bf.options.display.max_rows = 10 + + # Verify the row count remains immutable. + assert widget.row_count == initial_row_count + + +class FaultyIterator: + def __iter__(self): + return self + + def __next__(self): + raise ValueError("Simulated read error") + + +def test_widget_should_fallback_to_zero_rows_with_invlid_total_rows( + paginated_bf_df: bf.dataframe.DataFrame, + monkeypatch: pytest.MonkeyPatch, +): + """ + Given an internal component fails to return valid execution data, + when the TableWidget is created, its row_count should safely fall back to 0. + """ + # Patch the executor's 'execute' method to simulate an error. + monkeypatch.setattr( + "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", + lambda self, *args, **kwargs: mock_execute_result_with_params( + self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs + ), + ) + + # Create the TableWidget under the error condition. + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display import TableWidget + + # The widget should handle the faulty data from the mock without crashing. + widget = TableWidget(paginated_bf_df) + + # The widget safely defaults to 0 rows. + assert widget.row_count == 0 + + +def test_widget_row_count_reflects_actual_data_available( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Test that widget row_count reflects the actual data available, + regardless of theoretical limits. + """ + from bigframes.display import TableWidget + + # Set up display options that define a page size. 
+    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+        widget = TableWidget(paginated_bf_df)
+
+        # The widget should report the total rows in the DataFrame, not a
+        # count capped by page_size (which only affects pagination).
+        assert widget.row_count == EXPECTED_ROW_COUNT
+        assert widget.page_size == 2  # Respects the display option.
+
+
 # TODO(shuowei): Add tests for custom index and multiindex
 # This may not be necessary for the SQL Cell use case but should be
 # considered for completeness.
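For completeness, a usage sketch of the path these tests exercise, mirroring the notebook above. It assumes `bigframes` is installed with its optional anywidget dependencies (`anywidget`, `traitlets`) and runs in a Jupyter environment; the table name is illustrative:

```python
import bigframes.pandas as bpd

bpd.options.display.repr_mode = "anywidget"
bpd.options.display.max_rows = 10  # becomes the TableWidget's initial page_size

df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

# Displaying the DataFrame renders a paginated TableWidget whose row_count now
# comes from the executed query's metadata rather than a separate COUNT(*).
df
```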