
perf: Replace expensive len() call with PandasBatches.total_rows in anywidget TableWidget #1937

Open · wants to merge 12 commits into base: main
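In short, the widget previously called `len(dataframe)`, which issues a separate `SELECT COUNT(*)` query; this PR reuses the row count that `to_pandas_batches()` already returns with its metadata. A minimal sketch of the idea, not the exact widget code (the query and page size below are illustrative; the API names match the diff that follows):

```python
import typing

import bigframes.core.blocks
import bigframes.pandas as bpd

# Illustrative query; any BigQuery DataFrame works here.
df = bpd.read_gbq("SELECT * FROM `bigquery-public-data`.samples.shakespeare")

# Before: len(df) triggers an extra SELECT COUNT(*) round trip.
# row_count = len(df)

# After: the object returned by to_pandas_batches() already carries the total
# row count, so no additional query is needed.
batches = df.to_pandas_batches(page_size=25)
# The annotation says Iterable[pd.DataFrame], but the runtime type is
# PandasBatches, which exposes total_rows; cast so mypy accepts the access.
batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
row_count = batches.total_rows or 0
```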
48 changes: 29 additions & 19 deletions bigframes/display/anywidget.py
@@ -17,12 +17,14 @@
from importlib import resources
import functools
import math
from typing import Any, Dict, Iterator, List, Optional, Type
import typing
from typing import Any, cast, Dict, Iterator, List, Optional, Type
import uuid

import pandas as pd

import bigframes
import bigframes.core.blocks
import bigframes.display.html

# anywidget and traitlets are optional dependencies. We don't want the import of this
@@ -45,8 +47,10 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -63,28 +67,31 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
super().__init__()
self._dataframe = dataframe

# Initialize attributes that might be needed by observers FIRST
# Initialize attributes that might be needed by observers first
self._table_id = str(uuid.uuid4())
self._all_data_loaded = False
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
self._cached_batches: List[pd.DataFrame] = []

# respect display options for initial page size
# Respect display options for initial page size
initial_page_size = bigframes.options.display.max_rows

# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
batches = dataframe.to_pandas_batches(
page_size=initial_page_size,
)
self._batches: bigframes.core.blocks.PandasBatches = cast(
bigframes.core.blocks.PandasBatches, batches
)

# The query issued by `to_pandas_batches()` already contains metadata
# about how many results there were. Use that to avoid doing an extra
# COUNT(*) query that `len(...)` would do.
self.row_count = self._batches.total_rows or 0

# set traitlets properties that trigger observers
# Set page_size after _batches is available since traitlets observers
# may depend on _batches being initialized when the change trigger happens
self.page_size = initial_page_size

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)

# get the initial page
self._set_table_html()

@functools.cached_property
@@ -160,15 +167,17 @@ def _get_next_batch(self) -> bool:
batch = next(iterator)
self._cached_batches.append(batch)
return True
except StopIteration:
except StopIteration as e:
self._all_data_loaded = True
if not isinstance(e, StopIteration):
# If we fail to get a batch, assume no more data is available.
self.row_count = 0
return False

@property
def _batch_iterator(self) -> Iterator[pd.DataFrame]:
"""Lazily initializes and returns the batch iterator."""
if self._batch_iter is None:
self._batch_iter = iter(self._batches)
self._batch_iter = iter(self._batches)
return self._batch_iter

@property
@@ -180,7 +189,8 @@ def _cached_data(self) -> pd.DataFrame:

def _reset_batches_for_new_page_size(self):
"""Reset the batch iterator when page size changes."""
self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
self._cached_batches = []
self._batch_iter = None
self._all_data_loaded = False
20 changes: 16 additions & 4 deletions notebooks/dataframes/anywidget_mode.ipynb
@@ -76,7 +76,7 @@
{
"data": {
"text/html": [
"Query job a643d120-4af9-44fc-ba3c-ed461cf1092b is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a643d120-4af9-44fc-ba3c-ed461cf1092b&page=queryresults\">Open Job</a>"
"Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1ea2b594-2bd7-46de-a3c8-6aeee5884ba2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
@@ -139,10 +139,22 @@
"id": "ce250157",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:67e679e9-94da-47f7-8be1-8b4a496fbfbd&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d2d4ef22ea9f414b89ea5bd85f0e6635",
"model_id": "e74c3920b93644a0b2afdaa3841cad31",
"version_major": 2,
"version_minor": 1
},
@@ -193,7 +205,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "121e3d2f28004036a922e3a11a08d4b7",
"model_id": "b4f7a3f86ef54e07b24ef10061088391",
"version_major": 2,
"version_minor": 1
},
@@ -287,7 +299,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ed335bbbc064e5391ea06a9a218642e",
"model_id": "44a829aca2f24cfdba4b61afd1a259fe",
"version_major": 2,
"version_minor": 1
},
4 changes: 2 additions & 2 deletions notebooks/dataframes/dataframe.ipynb
@@ -5366,7 +5366,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
@@ -5380,7 +5380,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
"version": "3.10.15"
}
},
"nbformat": 4,
10 changes: 6 additions & 4 deletions tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -26,8 +27,9 @@ def aggregate_output(*, project_id, dataset_id, table_id):
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
next(iter(batches))
Collaborator:
Please add an assertion that references batches.total_rows. We want to mimic TableWidget as closely as we can.

For example:

Suggested change:
- next(iter(batches))
+ assert batches.total_rows >= 0
+ next(iter(batches))

Same for the other benchmarks.

Contributor (author):
This change will trigger a mypy error: to_pandas_batches() actually returns a PandasBatches object that has the total_rows attribute, but the type annotations show it as Iterable[pandas.DataFrame]. I can assert here after casting.
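For illustration only, a rough sketch of the cast-then-assert, assuming the `df` and `PAGE_SIZE` names already used in this benchmark (not the final committed code):

```python
import typing

import bigframes.core.blocks

batches = df.to_pandas_batches(page_size=PAGE_SIZE)
# Cast to the runtime PandasBatches type so mypy allows the total_rows access,
# then assert on it, per the suggestion above.
batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
assert batches.total_rows is not None and batches.total_rows >= 0
next(iter(batches))
```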


# To simulate very small rows that can only fit a boolean,
# some tables don't have an integer column. If an integer column is available,
@@ -43,8 +45,8 @@
.sum(numeric_only=True)
)

df_aggregated.shape
next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches_aggregated))


if __name__ == "__main__":
16 changes: 11 additions & 5 deletions tests/benchmark/read_gbq_colab/filter_output.py
@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

import bigframes.core.blocks
import bigframes.pandas as bpd

PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
@@ -31,16 +33,20 @@ def filter_output(
df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
next(iter(batches))

# Simulate the user filtering by a column and visualizing those results
df_filtered = df[df["col_bool_0"]]
rows, _ = df_filtered.shape

batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
batches_filtered = typing.cast(
bigframes.core.blocks.PandasBatches, batches_filtered
)
rows = batches_filtered.total_rows
assert rows >= 0
# It's possible we don't have any pages at all, since we filtered out all
# matching rows.
first_page = next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
first_page = next(iter(batches_filtered))
assert len(first_page.index) <= rows


7 changes: 5 additions & 2 deletions tests/benchmark/read_gbq_colab/first_page.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -28,8 +29,10 @@ def first_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and the first page.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
first_page = next(iter(batches))
assert first_page is not None


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions tests/benchmark/read_gbq_colab/last_page.py
@@ -28,8 +28,8 @@ def last_page(*, project_id, dataset_id, table_id):
)

# Get number of rows (to calculate number of pages) and then all pages.
df.shape
for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
for _ in batches:
pass


11 changes: 7 additions & 4 deletions tests/benchmark/read_gbq_colab/sort_output.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import typing

import benchmark.utils as utils

@@ -28,17 +29,19 @@ def sort_output(*, project_id, dataset_id, table_id):
)

# Simulate getting the first page, since we'll always do that first in the UI.
df.shape
next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
batches = df.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches).total_rows >= 0
next(iter(batches))

# Simulate the user sorting by a column and visualizing those results
sort_column = "col_int64_1"
if sort_column not in df.columns:
sort_column = "col_bool_0"

df_sorted = df.sort_values(sort_column)
df_sorted.shape
next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
assert typing.cast(typing.Any, batches_sorted).total_rows >= 0
next(iter(batches_sorted))


if __name__ == "__main__":