Skip to content

Commit e43636b

Browse files
committed
Merge remote-tracking branch 'origin/main' into create-model-support
2 parents 84f4427 + 52665fa commit e43636b

File tree

15 files changed

+849
-162
lines changed

15 files changed

+849
-162
lines changed

.librarian/state.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620
22
libraries:
33
- id: bigframes
4-
version: 2.28.0
4+
version: 2.29.1
55
apis: []
66
source_roots:
77
- .

bigframes/core/block_transforms.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -67,40 +67,39 @@ def indicate_duplicates(
6767
if keep not in ["first", "last", False]:
6868
raise ValueError("keep must be one of 'first', 'last', or False'")
6969

70+
rownums = agg_expressions.WindowExpression(
71+
agg_expressions.NullaryAggregation(
72+
agg_ops.RowNumberOp(),
73+
),
74+
window=windows.unbound(grouping_keys=tuple(columns)),
75+
)
76+
count = agg_expressions.WindowExpression(
77+
agg_expressions.NullaryAggregation(
78+
agg_ops.SizeOp(),
79+
),
80+
window=windows.unbound(grouping_keys=tuple(columns)),
81+
)
82+
7083
if keep == "first":
7184
# Count how many copies occur up to current copy of value
7285
# Discard this value if there are copies BEFORE
73-
window_spec = windows.cumulative_rows(
74-
grouping_keys=tuple(columns),
75-
)
86+
predicate = ops.gt_op.as_expr(rownums, ex.const(0))
7687
elif keep == "last":
7788
# Count how many copies occur up to current copy of values
7889
# Discard this value if there are copies AFTER
79-
window_spec = windows.inverse_cumulative_rows(
80-
grouping_keys=tuple(columns),
81-
)
90+
predicate = ops.lt_op.as_expr(rownums, ops.sub_op.as_expr(count, ex.const(1)))
8291
else: # keep == False
8392
# Count how many copies of the value occur in entire series.
8493
# Discard this value if there are copies ANYWHERE
85-
window_spec = windows.unbound(grouping_keys=tuple(columns))
86-
block, dummy = block.create_constant(1)
87-
# use row number as will work even with partial ordering
88-
block, val_count_col_id = block.apply_window_op(
89-
dummy,
90-
agg_ops.sum_op,
91-
window_spec=window_spec,
92-
)
93-
block, duplicate_indicator = block.project_expr(
94-
ops.gt_op.as_expr(val_count_col_id, ex.const(1))
94+
predicate = ops.gt_op.as_expr(count, ex.const(1))
95+
96+
block = block.project_block_exprs(
97+
[predicate],
98+
labels=[None],
9599
)
96100
return (
97-
block.drop_columns(
98-
(
99-
dummy,
100-
val_count_col_id,
101-
)
102-
),
103-
duplicate_indicator,
101+
block,
102+
block.value_columns[-1],
104103
)
105104

106105

bigframes/core/compile/polars/compiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,9 @@ def compile_agg_op(
547547
return pl.col(*inputs).first()
548548
if isinstance(op, agg_ops.LastOp):
549549
return pl.col(*inputs).last()
550+
if isinstance(op, agg_ops.RowNumberOp):
551+
# pl.row_index is not yet stable enough to use here, and only supports polars>=1.32
552+
return pl.int_range(pl.len(), dtype=pl.Int64)
550553
if isinstance(op, agg_ops.ShiftOp):
551554
return pl.col(*inputs).shift(op.periods)
552555
if isinstance(op, agg_ops.DiffOp):

bigframes/core/indexes/base.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,9 +376,7 @@ def __repr__(self) -> __builtins__.str:
376376
# metadata, like we do with DataFrame.
377377
opts = bigframes.options.display
378378
max_results = opts.max_rows
379-
# anywidget mode uses the same display logic as the "deferred" mode
380-
# for faster execution
381-
if opts.repr_mode in ("deferred", "anywidget"):
379+
if opts.repr_mode == "deferred":
382380
_, dry_run_query_job = self._block._compute_dry_run()
383381
return formatter.repr_query_job(dry_run_query_job)
384382

@@ -626,8 +624,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
626624
return Index(result)
627625

628626
def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
629-
if keep is not False:
630-
validations.enforce_ordered(self, "drop_duplicates")
631627
block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
632628
return Index(block)
633629

bigframes/dataframe.py

Lines changed: 113 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -789,9 +789,7 @@ def __repr__(self) -> str:
789789

790790
opts = bigframes.options.display
791791
max_results = opts.max_rows
792-
# anywidget mode uses the same display logic as the "deferred" mode
793-
# for faster execution
794-
if opts.repr_mode in ("deferred", "anywidget"):
792+
if opts.repr_mode == "deferred":
795793
return formatter.repr_query_job(self._compute_dry_run())
796794

797795
# TODO(swast): pass max_columns and get the true column count back. Maybe
@@ -829,68 +827,138 @@ def __repr__(self) -> str:
829827
lines.append(f"[{row_count} rows x {column_count} columns]")
830828
return "\n".join(lines)
831829

832-
def _repr_html_(self) -> str:
833-
"""
834-
Returns an html string primarily for use by notebooks for displaying
835-
a representation of the DataFrame. Displays 20 rows by default since
836-
many notebooks are not configured for large tables.
837-
"""
838-
opts = bigframes.options.display
839-
max_results = opts.max_rows
840-
if opts.repr_mode == "deferred":
841-
return formatter.repr_query_job(self._compute_dry_run())
842-
843-
# Process blob columns first, regardless of display mode
844-
self._cached()
845-
df = self.copy()
830+
def _get_display_df_and_blob_cols(self) -> tuple[DataFrame, list[str]]:
831+
"""Process blob columns for display."""
832+
df = self
833+
blob_cols = []
846834
if bigframes.options.display.blob_display:
847835
blob_cols = [
848836
series_name
849-
for series_name, series in df.items()
837+
for series_name, series in self.items()
850838
if series.dtype == bigframes.dtypes.OBJ_REF_DTYPE
851839
]
852-
for col in blob_cols:
853-
# TODO(garrettwu): It is not necessary to get access URLs for all the rows. Update once there is a way to get URLs from local data.
854-
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
840+
if blob_cols:
841+
df = self.copy()
842+
for col in blob_cols:
843+
# TODO(garrettwu): It is not necessary to get access URLs for all the rows. Update once there is a way to get URLs from local data.
844+
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
845+
return df, blob_cols
846+
847+
def _get_anywidget_bundle(self, include=None, exclude=None):
848+
"""
849+
Helper method to create and return the anywidget mimebundle.
850+
This function encapsulates the logic for anywidget display.
851+
"""
852+
from bigframes import display
853+
854+
# TODO(shuowei): Keep blob_cols and pass them to TableWidget so that they can render properly.
855+
df, _ = self._get_display_df_and_blob_cols()
856+
857+
# Create and display the widget
858+
widget = display.TableWidget(df)
859+
widget_repr_result = widget._repr_mimebundle_(include=include, exclude=exclude)
860+
861+
# Handle both tuple (data, metadata) and dict returns
862+
if isinstance(widget_repr_result, tuple):
863+
widget_repr = dict(widget_repr_result[0]) # Extract data dict from tuple
855864
else:
856-
blob_cols = []
865+
widget_repr = dict(widget_repr_result)
857866

858-
if opts.repr_mode == "anywidget":
859-
try:
860-
from IPython.display import display as ipython_display
867+
# At this point, we have already executed the query as part of the
868+
# widget construction. Let's use the information available to render
869+
# the HTML and plain text versions.
870+
widget_repr["text/html"] = widget.table_html
871+
872+
widget_repr["text/plain"] = self._create_text_representation(
873+
widget._cached_data, widget.row_count
874+
)
875+
876+
return widget_repr
877+
878+
def _create_text_representation(
879+
self, pandas_df: pandas.DataFrame, total_rows: typing.Optional[int]
880+
) -> str:
881+
"""Create a text representation of the DataFrame."""
882+
opts = bigframes.options.display
883+
with display_options.pandas_repr(opts):
884+
import pandas.io.formats
885+
886+
# safe to mutate this, this dict is owned by this code, and does not affect global config
887+
to_string_kwargs = (
888+
pandas.io.formats.format.get_dataframe_repr_params() # type: ignore
889+
)
890+
if not self._has_index:
891+
to_string_kwargs.update({"index": False})
892+
893+
# We add our own dimensions string, so don't want pandas to.
894+
to_string_kwargs.update({"show_dimensions": False})
895+
repr_string = pandas_df.to_string(**to_string_kwargs)
861896

862-
from bigframes import display
897+
lines = repr_string.split("\n")
863898

864-
# Always create a new widget instance for each display call
865-
# This ensures that each cell gets its own widget and prevents
866-
# unintended sharing between cells
867-
widget = display.TableWidget(df.copy())
899+
if total_rows is not None and total_rows > len(pandas_df):
900+
lines.append("...")
868901

869-
ipython_display(widget)
870-
return "" # Return empty string since we used display()
902+
lines.append("")
903+
column_count = len(self.columns)
904+
lines.append(f"[{total_rows or '?'} rows x {column_count} columns]")
905+
return "\n".join(lines)
871906

872-
except (AttributeError, ValueError, ImportError):
873-
# Fallback if anywidget is not available
907+
def _repr_mimebundle_(self, include=None, exclude=None):
908+
"""
909+
Custom display method for IPython/Jupyter environments.
910+
This is called by IPython's display system when the object is displayed.
911+
"""
912+
opts = bigframes.options.display
913+
# Only handle widget display in anywidget mode
914+
if opts.repr_mode == "anywidget":
915+
try:
916+
return self._get_anywidget_bundle(include=include, exclude=exclude)
917+
918+
except ImportError:
919+
# Anywidget is an optional dependency, so warn rather than fail.
920+
# TODO(shuowei): When Anywidget becomes the default for all repr modes,
921+
# remove this warning.
874922
warnings.warn(
875923
"Anywidget mode is not available. "
876924
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. "
877-
f"Falling back to deferred mode. Error: {traceback.format_exc()}"
925+
f"Falling back to static HTML. Error: {traceback.format_exc()}"
878926
)
879-
return formatter.repr_query_job(self._compute_dry_run())
880927

881-
# Continue with regular HTML rendering for non-anywidget modes
882-
# TODO(swast): pass max_columns and get the true column count back. Maybe
883-
# get 1 more column than we have requested so that pandas can add the
884-
# ... for us?
928+
# In non-anywidget mode, fetch data once and use it for both HTML
929+
# and plain text representations to avoid multiple queries.
930+
opts = bigframes.options.display
931+
max_results = opts.max_rows
932+
933+
df, blob_cols = self._get_display_df_and_blob_cols()
934+
885935
pandas_df, row_count, query_job = df._block.retrieve_repr_request_results(
886936
max_results
887937
)
888-
889938
self._set_internal_query_job(query_job)
890939
column_count = len(pandas_df.columns)
891940

941+
html_string = self._create_html_representation(
942+
pandas_df, row_count, column_count, blob_cols
943+
)
944+
945+
text_representation = self._create_text_representation(pandas_df, row_count)
946+
947+
return {"text/html": html_string, "text/plain": text_representation}
948+
949+
def _create_html_representation(
950+
self,
951+
pandas_df: pandas.DataFrame,
952+
row_count: int,
953+
column_count: int,
954+
blob_cols: list[str],
955+
) -> str:
956+
"""Create an HTML representation of the DataFrame."""
957+
opts = bigframes.options.display
892958
with display_options.pandas_repr(opts):
893-
# Allows previewing images in the DataFrame. The implementation changes the string repr as well, so that it doesn't truncate strings or escape HTML characters such as "<" and ">". We may need to implement a full-fledged repr module to better support types not in pandas.
959+
# TODO(shuowei, b/464053870): Escaping HTML would be useful, but
960+
# `escape=False` is needed to show images. We may need to implement
961+
# a full-fledged repr module to better support types not in pandas.
894962
if bigframes.options.display.blob_display and blob_cols:
895963

896964
def obj_ref_rt_to_html(obj_ref_rt) -> str:
@@ -919,15 +987,12 @@ def obj_ref_rt_to_html(obj_ref_rt) -> str:
919987

920988
# set max_colwidth so not to truncate the image url
921989
with pandas.option_context("display.max_colwidth", None):
922-
max_rows = pandas.get_option("display.max_rows")
923-
max_cols = pandas.get_option("display.max_columns")
924-
show_dimensions = pandas.get_option("display.show_dimensions")
925990
html_string = pandas_df.to_html(
926991
escape=False,
927992
notebook=True,
928-
max_rows=max_rows,
929-
max_cols=max_cols,
930-
show_dimensions=show_dimensions,
993+
max_rows=pandas.get_option("display.max_rows"),
994+
max_cols=pandas.get_option("display.max_columns"),
995+
show_dimensions=pandas.get_option("display.show_dimensions"),
931996
formatters=formatters, # type: ignore
932997
)
933998
else:
@@ -4989,8 +5054,6 @@ def drop_duplicates(
49895054
*,
49905055
keep: str = "first",
49915056
) -> DataFrame:
4992-
if keep is not False:
4993-
validations.enforce_ordered(self, "drop_duplicates(keep != False)")
49945057
if subset is None:
49955058
column_ids = self._block.value_columns
49965059
elif utils.is_list_like(subset):
@@ -5004,8 +5067,6 @@ def drop_duplicates(
50045067
return DataFrame(block)
50055068

50065069
def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Series:
5007-
if keep is not False:
5008-
validations.enforce_ordered(self, "duplicated(keep != False)")
50095070
if subset is None:
50105071
column_ids = self._block.value_columns
50115072
else:

bigframes/series.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2227,8 +2227,6 @@ def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None)
22272227
return self.reindex(other.index, validate=validate)
22282228

22292229
def drop_duplicates(self, *, keep: str = "first") -> Series:
2230-
if keep is not False:
2231-
validations.enforce_ordered(self, "drop_duplicates(keep != False)")
22322230
block = block_ops.drop_duplicates(self._block, (self._value_column,), keep)
22332231
return Series(block)
22342232

@@ -2249,8 +2247,6 @@ def unique(self, keep_order=True) -> Series:
22492247
return Series(block.select_columns(result).reset_index())
22502248

22512249
def duplicated(self, keep: str = "first") -> Series:
2252-
if keep is not False:
2253-
validations.enforce_ordered(self, "duplicated(keep != False)")
22542250
block, indicator = block_ops.indicate_duplicates(
22552251
self._block, (self._value_column,), keep
22562252
)

bigframes/streaming/dataframe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,13 +291,13 @@ def __repr__(self, *args, **kwargs):
291291

292292
__repr__.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.__repr__))
293293

294-
def _repr_html_(self, *args, **kwargs):
295-
return _return_type_wrapper(self._df._repr_html_, StreamingDataFrame)(
294+
def _repr_mimebundle_(self, *args, **kwargs):
295+
return _return_type_wrapper(self._df._repr_mimebundle_, StreamingDataFrame)(
296296
*args, **kwargs
297297
)
298298

299-
_repr_html_.__doc__ = _curate_df_doc(
300-
inspect.getdoc(dataframe.DataFrame._repr_html_)
299+
_repr_mimebundle_.__doc__ = _curate_df_doc(
300+
inspect.getdoc(dataframe.DataFrame._repr_mimebundle_)
301301
)
302302

303303
@property

docs/conf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,9 @@
172172
"url": "https://docs.cloud.google.com/bigquery/docs/bigquery-dataframes-introduction",
173173
},
174174
],
175+
"analytics": {
176+
"google_analytics_id": "G-XVSRMCJ37X",
177+
},
175178
}
176179

177180
# Add any paths that contain custom themes here, relative to this directory.

0 commit comments

Comments
 (0)