Skip to content

Commit 99149aa

Browse files
authored
Merge branch 'main' into benchmark_metric_huanc
2 parents 086ea2d + ba0d23b commit 99149aa

File tree

17 files changed

+551
-9
lines changed

17 files changed

+551
-9
lines changed

bigframes/core/compile/sqlglot/expressions/unary_compiler.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
import functools
1718
import typing
1819

1920
import pandas as pd
@@ -292,6 +293,18 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
292293
return sge.Extract(this=sge.Identifier(this="DAYOFYEAR"), expression=expr.expr)
293294

294295

296+
@UNARY_OP_REGISTRATION.register(ops.EndsWithOp)
297+
def _(op: ops.EndsWithOp, expr: TypedExpr) -> sge.Expression:
298+
if not op.pat:
299+
return sge.false()
300+
301+
def to_endswith(pat: str) -> sge.Expression:
302+
return sge.func("ENDS_WITH", expr.expr, sge.convert(pat))
303+
304+
conditions = [to_endswith(pat) for pat in op.pat]
305+
return functools.reduce(lambda x, y: sge.Or(this=x, expression=y), conditions)
306+
307+
295308
@UNARY_OP_REGISTRATION.register(ops.exp_op)
296309
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
297310
return sge.Case(
@@ -633,6 +646,18 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
633646
)
634647

635648

649+
@UNARY_OP_REGISTRATION.register(ops.StartsWithOp)
650+
def _(op: ops.StartsWithOp, expr: TypedExpr) -> sge.Expression:
651+
if not op.pat:
652+
return sge.false()
653+
654+
def to_startswith(pat: str) -> sge.Expression:
655+
return sge.func("STARTS_WITH", expr.expr, sge.convert(pat))
656+
657+
conditions = [to_startswith(pat) for pat in op.pat]
658+
return functools.reduce(lambda x, y: sge.Or(this=x, expression=y), conditions)
659+
660+
636661
@UNARY_OP_REGISTRATION.register(ops.StrStripOp)
637662
def _(op: ops.StrStripOp, expr: TypedExpr) -> sge.Expression:
638663
return sge.Trim(this=sge.convert(op.to_strip), expression=expr.expr)
@@ -656,6 +681,11 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
656681
)
657682

658683

684+
@UNARY_OP_REGISTRATION.register(ops.StringSplitOp)
685+
def _(op: ops.StringSplitOp, expr: TypedExpr) -> sge.Expression:
686+
return sge.Split(this=expr.expr, expression=sge.convert(op.pat))
687+
688+
659689
@UNARY_OP_REGISTRATION.register(ops.StrGetOp)
660690
def _(op: ops.StrGetOp, expr: TypedExpr) -> sge.Expression:
661691
return sge.Substring(
@@ -808,3 +838,31 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
808838
@UNARY_OP_REGISTRATION.register(ops.year_op)
809839
def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression:
810840
return sge.Extract(this=sge.Identifier(this="YEAR"), expression=expr.expr)
841+
842+
843+
@UNARY_OP_REGISTRATION.register(ops.ZfillOp)
844+
def _(op: ops.ZfillOp, expr: TypedExpr) -> sge.Expression:
845+
return sge.Case(
846+
ifs=[
847+
sge.If(
848+
this=sge.EQ(
849+
this=sge.Substring(
850+
this=expr.expr, start=sge.convert(1), length=sge.convert(1)
851+
),
852+
expression=sge.convert("-"),
853+
),
854+
true=sge.Concat(
855+
expressions=[
856+
sge.convert("-"),
857+
sge.func(
858+
"LPAD",
859+
sge.Substring(this=expr.expr, start=sge.convert(1)),
860+
sge.convert(op.width - 1),
861+
sge.convert("0"),
862+
),
863+
]
864+
),
865+
)
866+
],
867+
default=sge.func("LPAD", expr.expr, sge.convert(op.width), sge.convert("0")),
868+
)

bigframes/dataframe.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import traceback
2727
import typing
2828
from typing import (
29+
Any,
2930
Callable,
3031
Dict,
3132
Hashable,
@@ -91,6 +92,7 @@
9192
import bigframes.session
9293

9394
SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable]
95+
MultiItemValue = Union["DataFrame", Sequence[int | float | str | Callable]]
9496

9597
LevelType = typing.Hashable
9698
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
@@ -884,8 +886,13 @@ def __delitem__(self, key: str):
884886
df = self.drop(columns=[key])
885887
self._set_block(df._get_block())
886888

887-
def __setitem__(self, key: str, value: SingleItemValue):
888-
df = self._assign_single_item(key, value)
889+
def __setitem__(
890+
self, key: str | list[str], value: SingleItemValue | MultiItemValue
891+
):
892+
if isinstance(key, list):
893+
df = self._assign_multi_items(key, value)
894+
else:
895+
df = self._assign_single_item(key, value)
889896
self._set_block(df._get_block())
890897

891898
__setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__)
@@ -2212,7 +2219,7 @@ def assign(self, **kwargs) -> DataFrame:
22122219
def _assign_single_item(
22132220
self,
22142221
k: str,
2215-
v: SingleItemValue,
2222+
v: SingleItemValue | MultiItemValue,
22162223
) -> DataFrame:
22172224
if isinstance(v, bigframes.series.Series):
22182225
return self._assign_series_join_on_index(k, v)
@@ -2230,7 +2237,33 @@ def _assign_single_item(
22302237
elif utils.is_list_like(v):
22312238
return self._assign_single_item_listlike(k, v)
22322239
else:
2233-
return self._assign_scalar(k, v)
2240+
return self._assign_scalar(k, v) # type: ignore
2241+
2242+
def _assign_multi_items(
2243+
self,
2244+
k: list[str],
2245+
v: SingleItemValue | MultiItemValue,
2246+
) -> DataFrame:
2247+
value_sources: Sequence[Any] = []
2248+
if isinstance(v, DataFrame):
2249+
value_sources = [v[col] for col in v.columns]
2250+
elif isinstance(v, bigframes.series.Series):
2251+
# For behavior consistency with Pandas.
2252+
raise ValueError("Columns must be same length as key")
2253+
elif isinstance(v, Sequence):
2254+
value_sources = v
2255+
else:
2256+
# We assign the same scalar value to all target columns.
2257+
value_sources = [v] * len(k)
2258+
2259+
if len(value_sources) != len(k):
2260+
raise ValueError("Columns must be same length as key")
2261+
2262+
# Repeatedly assign columns in order.
2263+
result = self._assign_single_item(k[0], value_sources[0])
2264+
for target, source in zip(k[1:], value_sources[1:]):
2265+
result = result._assign_single_item(target, source)
2266+
return result
22342267

22352268
def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame:
22362269
given_rows = len(v)

bigframes/ml/model_selection.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919

2020
import inspect
21+
from itertools import chain
2122
import time
2223
from typing import cast, Generator, List, Optional, Union
2324

@@ -36,12 +37,9 @@ def train_test_split(
3637
train_size: Union[float, None] = None,
3738
random_state: Union[int, None] = None,
3839
stratify: Union[bpd.Series, None] = None,
40+
shuffle: bool = True,
3941
) -> List[Union[bpd.DataFrame, bpd.Series]]:
4042

41-
# TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same
42-
# number of rows. We probably want to do something similar. Now the implementation is based
43-
# on index. We'll move to based on ordering first.
44-
4543
if test_size is None:
4644
if train_size is None:
4745
test_size = 0.25
@@ -61,6 +59,26 @@ def train_test_split(
6159
f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}"
6260
)
6361

62+
if not shuffle:
63+
if stratify is not None:
64+
raise ValueError(
65+
"Stratified train/test split is not implemented for shuffle=False"
66+
)
67+
bf_arrays = list(utils.batch_convert_to_bf_equivalent(*arrays))
68+
69+
total_rows = len(bf_arrays[0])
70+
train_rows = int(total_rows * train_size)
71+
test_rows = total_rows - train_rows
72+
73+
return list(
74+
chain.from_iterable(
75+
[
76+
[bf_array.head(train_rows), bf_array.tail(test_rows)]
77+
for bf_array in bf_arrays
78+
]
79+
)
80+
)
81+
6482
dfs = list(utils.batch_convert_to_dataframe(*arrays))
6583

6684
def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:

bigframes/ml/utils.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,30 @@ def batch_convert_to_series(
7979
)
8080

8181

82+
def batch_convert_to_bf_equivalent(
83+
*input: ArrayType, session: Optional[Session] = None
84+
) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]:
85+
"""Converts the input to BigFrames DataFrame or Series.
86+
87+
Args:
88+
session:
89+
The session to convert local pandas instances to BigFrames counter-parts.
90+
It is not used if the input itself is already a BigFrame data frame or series.
91+
92+
"""
93+
_validate_sessions(*input, session=session)
94+
95+
for frame in input:
96+
if isinstance(frame, bpd.DataFrame) or isinstance(frame, pd.DataFrame):
97+
yield convert.to_bf_dataframe(frame, default_index=None, session=session)
98+
elif isinstance(frame, bpd.Series) or isinstance(frame, pd.Series):
99+
yield convert.to_bf_series(
100+
_get_only_column(frame), default_index=None, session=session
101+
)
102+
else:
103+
raise ValueError(f"Unsupported type: {type(frame)}")
104+
105+
82106
def _validate_sessions(*input: ArrayType, session: Optional[Session]):
83107
session_ids = set(
84108
i._session.session_id

tests/system/small/ml/test_model_selection.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@
1313
# limitations under the License.
1414

1515
import math
16+
from typing import cast
1617

1718
import pandas as pd
1819
import pytest
1920

2021
from bigframes.ml import model_selection
2122
import bigframes.pandas as bpd
23+
import bigframes.session
2224

2325

2426
@pytest.mark.parametrize(
@@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows(
219221
)
220222

221223

224+
def test_train_test_split_no_shuffle_correct_shape(
225+
penguins_df_default_index: bpd.DataFrame,
226+
):
227+
X = penguins_df_default_index[["species"]]
228+
y = penguins_df_default_index["body_mass_g"]
229+
X_train, X_test, y_train, y_test = model_selection.train_test_split(
230+
X, y, shuffle=False
231+
)
232+
assert isinstance(X_train, bpd.DataFrame)
233+
assert isinstance(X_test, bpd.DataFrame)
234+
assert isinstance(y_train, bpd.Series)
235+
assert isinstance(y_test, bpd.Series)
236+
237+
assert X_train.shape == (258, 1)
238+
assert X_test.shape == (86, 1)
239+
assert y_train.shape == (258,)
240+
assert y_test.shape == (86,)
241+
242+
243+
def test_train_test_split_no_shuffle_correct_rows(
244+
session: bigframes.session.Session, penguins_pandas_df_default_index: bpd.DataFrame
245+
):
246+
# Note that we're using `penguins_pandas_df_default_index` as this test depends
247+
# on a stable row order being present end to end
248+
# filter down to the chunkiest penguins, to keep our test code a reasonable size
249+
all_data = penguins_pandas_df_default_index[
250+
penguins_pandas_df_default_index.body_mass_g > 5500
251+
].sort_index()
252+
253+
# Note that bigframes loses the index if it doesn't have a name
254+
all_data.index.name = "rowindex"
255+
256+
df = session.read_pandas(all_data)
257+
258+
X = df[
259+
[
260+
"species",
261+
"island",
262+
"culmen_length_mm",
263+
]
264+
]
265+
y = df["body_mass_g"]
266+
X_train, X_test, y_train, y_test = model_selection.train_test_split(
267+
X, y, shuffle=False
268+
)
269+
270+
X_train_pd = cast(bpd.DataFrame, X_train).to_pandas()
271+
X_test_pd = cast(bpd.DataFrame, X_test).to_pandas()
272+
y_train_pd = cast(bpd.Series, y_train).to_pandas()
273+
y_test_pd = cast(bpd.Series, y_test).to_pandas()
274+
275+
total_rows = len(all_data)
276+
train_size = 0.75
277+
train_rows = int(total_rows * train_size)
278+
test_rows = total_rows - train_rows
279+
280+
expected_X_train = all_data.head(train_rows)[
281+
["species", "island", "culmen_length_mm"]
282+
]
283+
expected_y_train = all_data.head(train_rows)["body_mass_g"]
284+
285+
expected_X_test = all_data.tail(test_rows)[
286+
["species", "island", "culmen_length_mm"]
287+
]
288+
expected_y_test = all_data.tail(test_rows)["body_mass_g"]
289+
290+
pd.testing.assert_frame_equal(X_train_pd, expected_X_train)
291+
pd.testing.assert_frame_equal(X_test_pd, expected_X_test)
292+
pd.testing.assert_series_equal(y_train_pd, expected_y_train)
293+
pd.testing.assert_series_equal(y_test_pd, expected_y_test)
294+
295+
222296
@pytest.mark.parametrize(
223297
("train_size", "test_size"),
224298
[

0 commit comments

Comments
 (0)