
Commit 4776872

Merge branch 'main' into output_schema
2 parents 2f6df1d + ba0d23b commit 4776872

7 files changed: +323, -9 lines

bigframes/dataframe.py

Lines changed: 37 additions & 4 deletions
@@ -26,6 +26,7 @@
 import traceback
 import typing
 from typing import (
+    Any,
     Callable,
     Dict,
     Hashable,
@@ -91,6 +92,7 @@
 import bigframes.session

 SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable]
+MultiItemValue = Union["DataFrame", Sequence[int | float | str | Callable]]

 LevelType = typing.Hashable
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
@@ -884,8 +886,13 @@ def __delitem__(self, key: str):
         df = self.drop(columns=[key])
         self._set_block(df._get_block())

-    def __setitem__(self, key: str, value: SingleItemValue):
-        df = self._assign_single_item(key, value)
+    def __setitem__(
+        self, key: str | list[str], value: SingleItemValue | MultiItemValue
+    ):
+        if isinstance(key, list):
+            df = self._assign_multi_items(key, value)
+        else:
+            df = self._assign_single_item(key, value)
         self._set_block(df._get_block())

     __setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__)
@@ -2212,7 +2219,7 @@ def assign(self, **kwargs) -> DataFrame:
     def _assign_single_item(
         self,
         k: str,
-        v: SingleItemValue,
+        v: SingleItemValue | MultiItemValue,
     ) -> DataFrame:
         if isinstance(v, bigframes.series.Series):
             return self._assign_series_join_on_index(k, v)
@@ -2230,7 +2237,33 @@ def _assign_single_item(
         elif utils.is_list_like(v):
             return self._assign_single_item_listlike(k, v)
         else:
-            return self._assign_scalar(k, v)
+            return self._assign_scalar(k, v)  # type: ignore
+
+    def _assign_multi_items(
+        self,
+        k: list[str],
+        v: SingleItemValue | MultiItemValue,
+    ) -> DataFrame:
+        value_sources: Sequence[Any] = []
+        if isinstance(v, DataFrame):
+            value_sources = [v[col] for col in v.columns]
+        elif isinstance(v, bigframes.series.Series):
+            # For behavior consistency with Pandas.
+            raise ValueError("Columns must be same length as key")
+        elif isinstance(v, Sequence):
+            value_sources = v
+        else:
+            # We assign the same scalar value to all target columns.
+            value_sources = [v] * len(k)
+
+        if len(value_sources) != len(k):
+            raise ValueError("Columns must be same length as key")
+
+        # Repeatedly assign columns in order.
+        result = self._assign_single_item(k[0], value_sources[0])
+        for target, source in zip(k[1:], value_sources[1:]):
+            result = result._assign_single_item(target, source)
+        return result

     def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame:
         given_rows = len(v)
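
The multi-column branch fans out into repeated single-column assignments, so the pandas semantics fall out naturally. A minimal usage sketch of the new behavior (illustrative only; the DataFrame and its column names are hypothetical):

import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

df[["a", "b"]] = 0                  # scalar broadcast to every listed column
df[["a", "c"]] = [10, 20]           # one value source per target; "c" is created
df[["a", "b"]] = df[["b", "a"]]     # DataFrame RHS: columns consumed in order

# A Series on the right-hand side raises, matching pandas:
# df[["a", "b"]] = df["a"]          # ValueError: Columns must be same length as key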

bigframes/ml/model_selection.py

Lines changed: 22 additions & 4 deletions
@@ -18,6 +18,7 @@


 import inspect
+from itertools import chain
 import time
 from typing import cast, Generator, List, Optional, Union

@@ -36,12 +37,9 @@ def train_test_split(
     train_size: Union[float, None] = None,
     random_state: Union[int, None] = None,
     stratify: Union[bpd.Series, None] = None,
+    shuffle: bool = True,
 ) -> List[Union[bpd.DataFrame, bpd.Series]]:

-    # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same
-    # number of rows. We probably want to do something similar. Now the implementation is based
-    # on index. We'll move to based on ordering first.
-
     if test_size is None:
         if train_size is None:
             test_size = 0.25
@@ -61,6 +59,26 @@
             f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}"
         )

+    if not shuffle:
+        if stratify is not None:
+            raise ValueError(
+                "Stratified train/test split is not implemented for shuffle=False"
+            )
+        bf_arrays = list(utils.batch_convert_to_bf_equivalent(*arrays))
+
+        total_rows = len(bf_arrays[0])
+        train_rows = int(total_rows * train_size)
+        test_rows = total_rows - train_rows
+
+        return list(
+            chain.from_iterable(
+                [
+                    [bf_array.head(train_rows), bf_array.tail(test_rows)]
+                    for bf_array in bf_arrays
+                ]
+            )
+        )
+
     dfs = list(utils.batch_convert_to_dataframe(*arrays))

     def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:
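
With shuffle=False, the split becomes a deterministic head/tail cut in the existing row order rather than a random sample, short-circuiting before the shuffled code path. A hedged usage sketch (X and y stand in for any equal-length BigFrames DataFrame and Series):

from bigframes.ml import model_selection

# First 75% of rows (in current order) go to train, the last 25% to test.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.75, shuffle=False
)

# Stratification requires shuffling, so this combination raises:
# model_selection.train_test_split(X, y, stratify=y, shuffle=False)  # ValueError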

bigframes/ml/utils.py

Lines changed: 24 additions & 0 deletions
@@ -79,6 +79,30 @@ def batch_convert_to_series(
     )


+def batch_convert_to_bf_equivalent(
+    *input: ArrayType, session: Optional[Session] = None
+) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]:
+    """Converts each input to its BigFrames equivalent (DataFrame or Series).
+
+    Args:
+        session:
+            The session used to convert local pandas instances to their
+            BigFrames counterparts. It is not used if the input is already a
+            BigFrames DataFrame or Series.
+    """
+    _validate_sessions(*input, session=session)
+
+    for frame in input:
+        if isinstance(frame, bpd.DataFrame) or isinstance(frame, pd.DataFrame):
+            yield convert.to_bf_dataframe(frame, default_index=None, session=session)
+        elif isinstance(frame, bpd.Series) or isinstance(frame, pd.Series):
+            yield convert.to_bf_series(
+                _get_only_column(frame), default_index=None, session=session
+            )
+        else:
+            raise ValueError(f"Unsupported type: {type(frame)}")
+
+
 def _validate_sessions(*input: ArrayType, session: Optional[Session]):
     session_ids = set(
         i._session.session_id
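
Unlike batch_convert_to_dataframe, which coerces everything to DataFrames, this helper preserves each input's shape: frame-like inputs come back as BigFrames DataFrames and series-like inputs as BigFrames Series. A small sketch, assuming an existing BigFrames Session named `session`:

import pandas as pd
from bigframes.ml import utils

pd_df = pd.DataFrame({"x": [1, 2]})
pd_s = pd.Series([3, 4], name="y")

# Yields a bigframes DataFrame and a bigframes Series, in input order.
bf_df, bf_s = utils.batch_convert_to_bf_equivalent(pd_df, pd_s, session=session)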

tests/system/small/ml/test_model_selection.py

Lines changed: 74 additions & 0 deletions
@@ -13,12 +13,14 @@
 # limitations under the License.

 import math
+from typing import cast

 import pandas as pd
 import pytest

 from bigframes.ml import model_selection
 import bigframes.pandas as bpd
+import bigframes.session


 @pytest.mark.parametrize(
@@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows(
     )


+def test_train_test_split_no_shuffle_correct_shape(
+    penguins_df_default_index: bpd.DataFrame,
+):
+    X = penguins_df_default_index[["species"]]
+    y = penguins_df_default_index["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+    assert isinstance(X_train, bpd.DataFrame)
+    assert isinstance(X_test, bpd.DataFrame)
+    assert isinstance(y_train, bpd.Series)
+    assert isinstance(y_test, bpd.Series)
+
+    assert X_train.shape == (258, 1)
+    assert X_test.shape == (86, 1)
+    assert y_train.shape == (258,)
+    assert y_test.shape == (86,)
+
+
+def test_train_test_split_no_shuffle_correct_rows(
+    session: bigframes.session.Session, penguins_pandas_df_default_index: bpd.DataFrame
+):
+    # Note that we're using `penguins_pandas_df_default_index`, as this test
+    # depends on a stable row order being present end to end.
+    # Filter down to the chunkiest penguins to keep the test code a reasonable size.
+    all_data = penguins_pandas_df_default_index[
+        penguins_pandas_df_default_index.body_mass_g > 5500
+    ].sort_index()
+
+    # Note that bigframes loses the index if it doesn't have a name.
+    all_data.index.name = "rowindex"
+
+    df = session.read_pandas(all_data)
+
+    X = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+        ]
+    ]
+    y = df["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+
+    X_train_pd = cast(bpd.DataFrame, X_train).to_pandas()
+    X_test_pd = cast(bpd.DataFrame, X_test).to_pandas()
+    y_train_pd = cast(bpd.Series, y_train).to_pandas()
+    y_test_pd = cast(bpd.Series, y_test).to_pandas()
+
+    total_rows = len(all_data)
+    train_size = 0.75
+    train_rows = int(total_rows * train_size)
+    test_rows = total_rows - train_rows
+
+    expected_X_train = all_data.head(train_rows)[
+        ["species", "island", "culmen_length_mm"]
+    ]
+    expected_y_train = all_data.head(train_rows)["body_mass_g"]
+
+    expected_X_test = all_data.tail(test_rows)[
+        ["species", "island", "culmen_length_mm"]
+    ]
+    expected_y_test = all_data.tail(test_rows)["body_mass_g"]
+
+    pd.testing.assert_frame_equal(X_train_pd, expected_X_train)
+    pd.testing.assert_frame_equal(X_test_pd, expected_X_test)
+    pd.testing.assert_series_equal(y_train_pd, expected_y_train)
+    pd.testing.assert_series_equal(y_test_pd, expected_y_test)
+
+
 @pytest.mark.parametrize(
     ("train_size", "test_size"),
     [
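
The hard-coded shapes in the first test follow from the default train_size of 0.75 applied to the 344-row penguins table: int(344 * 0.75) == 258 train rows, leaving 86 for test.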

tests/system/small/test_dataframe.py

Lines changed: 61 additions & 0 deletions
@@ -1138,6 +1138,67 @@ def test_assign_new_column_w_setitem_list_error(scalars_dfs):
         bf_df["new_col"] = [1, 2, 3]


+@pytest.mark.parametrize(
+    ("key", "value"),
+    [
+        pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"),
+        pytest.param(
+            ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column"
+        ),
+        pytest.param(
+            ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column"
+        ),
+        pytest.param(
+            ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column"
+        ),
+    ],
+)
+def test_setitem_multicolumn_with_literals(scalars_dfs, key, value):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.copy()
+    pd_result = scalars_pandas_df.copy()
+
+    bf_result[key] = value
+    pd_result[key] = value
+
+    pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False)
+
+
+def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col", "int64_too"]] = [1]
+
+
+def test_setitem_multicolumn_with_dataframes(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.copy()
+    pd_result = scalars_pandas_df.copy()
+
+    bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2
+    pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2
+
+    pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False)
+
+
+def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2
+
+
+def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+    bf_result = scalars_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2
+
+
 def test_assign_existing_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     kwargs = {"int64_col": 2}

tests/unit/functions/test_remote_function_utils.py

Lines changed: 72 additions & 0 deletions
@@ -243,6 +243,78 @@ def test_package_existed_helper():
     assert not _utils._package_existed([], "pandas")


+def _function_add_one(x):
+    return x + 1
+
+
+def _function_add_two(x):
+    return x + 2
+
+
+@pytest.mark.parametrize(
+    "func1, func2, should_be_equal, description",
+    [
+        (
+            _function_add_one,
+            _function_add_one,
+            True,
+            "Identical functions should have the same hash.",
+        ),
+        (
+            _function_add_one,
+            _function_add_two,
+            False,
+            "Different functions should have different hashes.",
+        ),
+    ],
+)
+def test_get_hash_without_package_requirements(
+    func1, func2, should_be_equal, description
+):
+    """Tests function hashes without any requirements."""
+    hash1 = _utils.get_hash(func1)
+    hash2 = _utils.get_hash(func2)
+
+    if should_be_equal:
+        assert hash1 == hash2, f"FAILED: {description}"
+    else:
+        assert hash1 != hash2, f"FAILED: {description}"
+
+
+@pytest.mark.parametrize(
+    "reqs1, reqs2, should_be_equal, description",
+    [
+        (
+            None,
+            ["pandas>=1.0"],
+            False,
+            "A hash with requirements should differ from one without.",
+        ),
+        (
+            ["pandas", "numpy", "scikit-learn"],
+            ["numpy", "scikit-learn", "pandas"],
+            True,
+            "The same requirements in any order should produce the same hash.",
+        ),
+        (
+            ["pandas==1.0"],
+            ["pandas==2.0"],
+            False,
+            "Different requirement versions should produce different hashes.",
+        ),
+    ],
+)
+def test_get_hash_with_package_requirements(reqs1, reqs2, should_be_equal, description):
+    """Tests how package requirements affect the final hash."""
+    hash1 = _utils.get_hash(_function_add_one, package_requirements=reqs1)
+    hash2 = _utils.get_hash(_function_add_one, package_requirements=reqs2)
+
+    if should_be_equal:
+        assert hash1 == hash2, f"FAILED: {description}"
+    else:
+        assert hash1 != hash2, f"FAILED: {description}"
+
+
 # Helper functions for signature inspection tests
 def _func_one_arg_annotated(x: int) -> int:
     """A function with one annotated arg and an annotated return type."""
