
Commit 4776872

Merge branch 'main' into output_schema
2 parents 2f6df1d + ba0d23b commit 4776872

7 files changed: +323, -9 lines

bigframes/dataframe.py

Lines changed: 37 additions & 4 deletions
@@ -26,6 +26,7 @@
 import traceback
 import typing
 from typing import (
+    Any,
     Callable,
     Dict,
     Hashable,
@@ -91,6 +92,7 @@
 import bigframes.session

 SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable]
+MultiItemValue = Union["DataFrame", Sequence[int | float | str | Callable]]

 LevelType = typing.Hashable
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
@@ -884,8 +886,13 @@ def __delitem__(self, key: str):
         df = self.drop(columns=[key])
         self._set_block(df._get_block())

-    def __setitem__(self, key: str, value: SingleItemValue):
-        df = self._assign_single_item(key, value)
+    def __setitem__(
+        self, key: str | list[str], value: SingleItemValue | MultiItemValue
+    ):
+        if isinstance(key, list):
+            df = self._assign_multi_items(key, value)
+        else:
+            df = self._assign_single_item(key, value)
         self._set_block(df._get_block())

     __setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__)
@@ -2212,7 +2219,7 @@ def assign(self, **kwargs) -> DataFrame:
     def _assign_single_item(
         self,
         k: str,
-        v: SingleItemValue,
+        v: SingleItemValue | MultiItemValue,
     ) -> DataFrame:
         if isinstance(v, bigframes.series.Series):
             return self._assign_series_join_on_index(k, v)
@@ -2230,7 +2237,33 @@ def _assign_single_item(
         elif utils.is_list_like(v):
             return self._assign_single_item_listlike(k, v)
         else:
-            return self._assign_scalar(k, v)
+            return self._assign_scalar(k, v)  # type: ignore
+
+    def _assign_multi_items(
+        self,
+        k: list[str],
+        v: SingleItemValue | MultiItemValue,
+    ) -> DataFrame:
+        value_sources: Sequence[Any] = []
+        if isinstance(v, DataFrame):
+            value_sources = [v[col] for col in v.columns]
+        elif isinstance(v, bigframes.series.Series):
+            # For behavior consistency with Pandas.
+            raise ValueError("Columns must be same length as key")
+        elif isinstance(v, Sequence):
+            value_sources = v
+        else:
+            # We assign the same scalar value to all target columns.
+            value_sources = [v] * len(k)
+
+        if len(value_sources) != len(k):
+            raise ValueError("Columns must be same length as key")
+
+        # Repeatedly assign columns in order.
+        result = self._assign_single_item(k[0], value_sources[0])
+        for target, source in zip(k[1:], value_sources[1:]):
+            result = result._assign_single_item(target, source)
+        return result

     def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame:
         given_rows = len(v)
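
The multi-column branch fans out into repeated single-column assignments, so the pandas semantics fall out naturally. A minimal usage sketch of the new behavior (illustrative only; the DataFrame and its column names are hypothetical):

import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

df[["a", "b"]] = 0                  # scalar broadcast to every listed column
df[["a", "c"]] = [10, 20]           # one value source per target; "c" is created
df[["a", "b"]] = df[["b", "a"]]     # DataFrame RHS: columns consumed in order

# A Series on the right-hand side raises, matching pandas:
# df[["a", "b"]] = df["a"]          # ValueError: Columns must be same length as key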

bigframes/ml/model_selection.py

Lines changed: 22 additions & 4 deletions
@@ -18,6 +18,7 @@


 import inspect
+from itertools import chain
 import time
 from typing import cast, Generator, List, Optional, Union

@@ -36,12 +37,9 @@ def train_test_split(
     train_size: Union[float, None] = None,
     random_state: Union[int, None] = None,
     stratify: Union[bpd.Series, None] = None,
+    shuffle: bool = True,
 ) -> List[Union[bpd.DataFrame, bpd.Series]]:

-    # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same
-    # number of rows. We probably want to do something similar. Now the implementation is based
-    # on index. We'll move to based on ordering first.
-
     if test_size is None:
         if train_size is None:
             test_size = 0.25
@@ -61,6 +59,26 @@
             f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}"
         )

+    if not shuffle:
+        if stratify is not None:
+            raise ValueError(
+                "Stratified train/test split is not implemented for shuffle=False"
+            )
+        bf_arrays = list(utils.batch_convert_to_bf_equivalent(*arrays))
+
+        total_rows = len(bf_arrays[0])
+        train_rows = int(total_rows * train_size)
+        test_rows = total_rows - train_rows
+
+        return list(
+            chain.from_iterable(
+                [
+                    [bf_array.head(train_rows), bf_array.tail(test_rows)]
+                    for bf_array in bf_arrays
+                ]
+            )
+        )
+
     dfs = list(utils.batch_convert_to_dataframe(*arrays))

     def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:
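
With shuffle=False, the split becomes a deterministic head/tail cut in the existing row order rather than a random sample, short-circuiting before the shuffled code path. A hedged usage sketch (X and y stand in for any equal-length BigFrames DataFrame and Series):

from bigframes.ml import model_selection

# First 75% of rows (in current order) go to train, the last 25% to test.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.75, shuffle=False
)

# Stratification requires shuffling, so this combination raises:
# model_selection.train_test_split(X, y, stratify=y, shuffle=False)  # ValueError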

bigframes/ml/utils.py

Lines changed: 24 additions & 0 deletions
@@ -79,6 +79,30 @@ def batch_convert_to_series(
     )


+def batch_convert_to_bf_equivalent(
+    *input: ArrayType, session: Optional[Session] = None
+) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]:
+    """Converts each input to its BigFrames equivalent (DataFrame or Series).
+
+    Args:
+        session:
+            The session used to convert local pandas instances to their
+            BigFrames counterparts. It is not used if the input is already a
+            BigFrames DataFrame or Series.
+    """
+    _validate_sessions(*input, session=session)
+
+    for frame in input:
+        if isinstance(frame, bpd.DataFrame) or isinstance(frame, pd.DataFrame):
+            yield convert.to_bf_dataframe(frame, default_index=None, session=session)
+        elif isinstance(frame, bpd.Series) or isinstance(frame, pd.Series):
+            yield convert.to_bf_series(
+                _get_only_column(frame), default_index=None, session=session
+            )
+        else:
+            raise ValueError(f"Unsupported type: {type(frame)}")
+
+
 def _validate_sessions(*input: ArrayType, session: Optional[Session]):
     session_ids = set(
         i._session.session_id
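
Unlike batch_convert_to_dataframe, which coerces everything to DataFrames, this helper preserves each input's shape: frame-like inputs come back as BigFrames DataFrames and series-like inputs as BigFrames Series. A small sketch, assuming an existing BigFrames Session named `session`:

import pandas as pd
from bigframes.ml import utils

pd_df = pd.DataFrame({"x": [1, 2]})
pd_s = pd.Series([3, 4], name="y")

# Yields a bigframes DataFrame and a bigframes Series, in input order.
bf_df, bf_s = utils.batch_convert_to_bf_equivalent(pd_df, pd_s, session=session)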

tests/system/small/ml/test_model_selection.py

Lines changed: 74 additions & 0 deletions
@@ -13,12 +13,14 @@
 # limitations under the License.

 import math
+from typing import cast

 import pandas as pd
 import pytest

 from bigframes.ml import model_selection
 import bigframes.pandas as bpd
+import bigframes.session


 @pytest.mark.parametrize(
@@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows(
     )


+def test_train_test_split_no_shuffle_correct_shape(
+    penguins_df_default_index: bpd.DataFrame,
+):
+    X = penguins_df_default_index[["species"]]
+    y = penguins_df_default_index["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+    assert isinstance(X_train, bpd.DataFrame)
+    assert isinstance(X_test, bpd.DataFrame)
+    assert isinstance(y_train, bpd.Series)
+    assert isinstance(y_test, bpd.Series)
+
+    assert X_train.shape == (258, 1)
+    assert X_test.shape == (86, 1)
+    assert y_train.shape == (258,)
+    assert y_test.shape == (86,)
+
+
+def test_train_test_split_no_shuffle_correct_rows(
+    session: bigframes.session.Session, penguins_pandas_df_default_index: bpd.DataFrame
+):
+    # Note that we're using `penguins_pandas_df_default_index`, as this test
+    # depends on a stable row order being present end to end.
+    # Filter down to the chunkiest penguins to keep the test code a reasonable size.
+    all_data = penguins_pandas_df_default_index[
+        penguins_pandas_df_default_index.body_mass_g > 5500
+    ].sort_index()
+
+    # Note that bigframes loses the index if it doesn't have a name.
+    all_data.index.name = "rowindex"
+
+    df = session.read_pandas(all_data)
+
+    X = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+        ]
+    ]
+    y = df["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+
+    X_train_pd = cast(bpd.DataFrame, X_train).to_pandas()
+    X_test_pd = cast(bpd.DataFrame, X_test).to_pandas()
+    y_train_pd = cast(bpd.Series, y_train).to_pandas()
+    y_test_pd = cast(bpd.Series, y_test).to_pandas()
+
+    total_rows = len(all_data)
+    train_size = 0.75
+    train_rows = int(total_rows * train_size)
+    test_rows = total_rows - train_rows
+
+    expected_X_train = all_data.head(train_rows)[
+        ["species", "island", "culmen_length_mm"]
+    ]
+    expected_y_train = all_data.head(train_rows)["body_mass_g"]
+
+    expected_X_test = all_data.tail(test_rows)[
+        ["species", "island", "culmen_length_mm"]
+    ]
+    expected_y_test = all_data.tail(test_rows)["body_mass_g"]
+
+    pd.testing.assert_frame_equal(X_train_pd, expected_X_train)
+    pd.testing.assert_frame_equal(X_test_pd, expected_X_test)
+    pd.testing.assert_series_equal(y_train_pd, expected_y_train)
+    pd.testing.assert_series_equal(y_test_pd, expected_y_test)
+
+
 @pytest.mark.parametrize(
     ("train_size", "test_size"),
     [
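
The hard-coded shapes in the first test follow from the default train_size of 0.75 applied to the 344-row penguins table: int(344 * 0.75) == 258 train rows, leaving 86 for test.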

tests/system/small/test_dataframe.py

Lines changed: 61 additions & 0 deletions
@@ -1138,6 +1138,67 @@ def test_assign_new_column_w_setitem_list_error(scalars_dfs):
         bf_df["new_col"] = [1, 2, 3]


+@pytest.mark.parametrize(
+    ("key", "value"),
+    [
+        pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"),
+        pytest.param(
+            ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column"
+        ),
+        pytest.param(
+            ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column"
+        ),
+        pytest.param(
+            ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column"
+        ),
+    ],
+)
+def test_setitem_multicolumn_with_literals(scalars_dfs, key, value):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.copy()
+    pd_result = scalars_pandas_df.copy()
+
+    bf_result[key] = value
+    pd_result[key] = value
+
+    pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False)
+
+
+def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col", "int64_too"]] = [1]
+
+
+def test_setitem_multicolumn_with_dataframes(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.copy()
+    pd_result = scalars_pandas_df.copy()
+
+    bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2
+    pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2
+
+    pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False)
+
+
+def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2
+
+
+def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2
+
+
 def test_assign_existing_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     kwargs = {"int64_col": 2}

tests/unit/functions/test_remote_function_utils.py

Lines changed: 72 additions & 0 deletions
@@ -243,6 +243,78 @@ def test_package_existed_helper():
     assert not _utils._package_existed([], "pandas")


+def _function_add_one(x):
+    return x + 1
+
+
+def _function_add_two(x):
+    return x + 2
+
+
+@pytest.mark.parametrize(
+    "func1, func2, should_be_equal, description",
+    [
+        (
+            _function_add_one,
+            _function_add_one,
+            True,
+            "Identical functions should have the same hash.",
+        ),
+        (
+            _function_add_one,
+            _function_add_two,
+            False,
+            "Different functions should have different hashes.",
+        ),
+    ],
+)
+def test_get_hash_without_package_requirements(
+    func1, func2, should_be_equal, description
+):
+    """Tests function hashes without any requirements."""
+    hash1 = _utils.get_hash(func1)
+    hash2 = _utils.get_hash(func2)
+
+    if should_be_equal:
+        assert hash1 == hash2, f"FAILED: {description}"
+    else:
+        assert hash1 != hash2, f"FAILED: {description}"
+
+
+@pytest.mark.parametrize(
+    "reqs1, reqs2, should_be_equal, description",
+    [
+        (
+            None,
+            ["pandas>=1.0"],
+            False,
+            "A hash with requirements should differ from one without.",
+        ),
+        (
+            ["pandas", "numpy", "scikit-learn"],
+            ["numpy", "scikit-learn", "pandas"],
+            True,
+            "The same requirements in any order should produce the same hash.",
+        ),
+        (
+            ["pandas==1.0"],
+            ["pandas==2.0"],
+            False,
+            "Different requirement versions should produce different hashes.",
+        ),
+    ],
+)
+def test_get_hash_with_package_requirements(reqs1, reqs2, should_be_equal, description):
+    """Tests how package requirements affect the final hash."""
+    hash1 = _utils.get_hash(_function_add_one, package_requirements=reqs1)
+    hash2 = _utils.get_hash(_function_add_one, package_requirements=reqs2)
+
+    if should_be_equal:
+        assert hash1 == hash2, f"FAILED: {description}"
+    else:
+        assert hash1 != hash2, f"FAILED: {description}"
+
+
 # Helper functions for signature inspection tests
 def _func_one_arg_annotated(x: int) -> int:
     """A function with one annotated arg and an annotated return type."""
