feat: add parameter shuffle for ml.model_selection.train_test_split (#2030)

GarrettWu · web-flow · commit 2c72c56fb589 · 2025-08-27T13:52:13.000-05:00
* feat: add parameter shuffle for ml.model_selection.train_test_split

* mypy

* rename
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
@@ -18,6 +18,7 @@
 
 
 import inspect
+from itertools import chain
 import time
 from typing import cast, Generator, List, Optional, Union
 
@@ -36,12 +37,9 @@ def train_test_split(
     train_size: Union[float, None] = None,
     random_state: Union[int, None] = None,
     stratify: Union[bpd.Series, None] = None,
+    shuffle: bool = True,
 ) -> List[Union[bpd.DataFrame, bpd.Series]]:
 
-    # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same
-    # number of rows. We probably want to do something similar. Now the implementation is based
-    # on index. We'll move to based on ordering first.
-
     if test_size is None:
         if train_size is None:
             test_size = 0.25
@@ -61,6 +59,26 @@ def train_test_split(
             f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}"
         )
 
+    if not shuffle:
+        if stratify is not None:
+            raise ValueError(
+                "Stratified train/test split is not implemented for shuffle=False"
+            )
+        bf_arrays = list(utils.batch_convert_to_bf_equivalent(*arrays))
+
+        total_rows = len(bf_arrays[0])
+        train_rows = int(total_rows * train_size)
+        test_rows = total_rows - train_rows
+
+        return list(
+            chain.from_iterable(
+                [
+                    [bf_array.head(train_rows), bf_array.tail(test_rows)]
+                    for bf_array in bf_arrays
+                ]
+            )
+        )
+
     dfs = list(utils.batch_convert_to_dataframe(*arrays))
 
     def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:
diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py
@@ -79,6 +79,30 @@ def batch_convert_to_series(
     )
 
 
+def batch_convert_to_bf_equivalent(
+    *input: ArrayType, session: Optional[Session] = None
+) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]:
+    """Yield the BigFrames DataFrame or Series equivalent of each input, in order.
+
+    Args:
+        input: DataFrames or Series, either BigFrames (bpd) or pandas (pd) objects.
+        session: Session for converting pandas inputs; unused for BigFrames inputs.
+
+    Raises:
+        ValueError: If an input is neither a DataFrame nor a Series.
+    """
+    _validate_sessions(*input, session=session)
+
+    for item in input:
+        if isinstance(item, (bpd.DataFrame, pd.DataFrame)):
+            yield convert.to_bf_dataframe(item, default_index=None, session=session)
+        elif isinstance(item, (bpd.Series, pd.Series)):
+            only_column = _get_only_column(item)
+            yield convert.to_bf_series(only_column, default_index=None, session=session)
+        else:
+            raise ValueError(f"Unsupported type: {type(item)}")
+
+
 def _validate_sessions(*input: ArrayType, session: Optional[Session]):
     session_ids = set(
         i._session.session_id
diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py
@@ -13,12 +13,14 @@
 # limitations under the License.
 
 import math
+from typing import cast
 
 import pandas as pd
 import pytest
 
 from bigframes.ml import model_selection
 import bigframes.pandas as bpd
+import bigframes.session
 
 
 @pytest.mark.parametrize(
@@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows(
     )
 
 
+def test_train_test_split_no_shuffle_correct_shape(
+    penguins_df_default_index: bpd.DataFrame,
+):
+    X = penguins_df_default_index[["species"]]
+    y = penguins_df_default_index["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+    assert isinstance(X_train, bpd.DataFrame)
+    assert isinstance(X_test, bpd.DataFrame)
+    assert isinstance(y_train, bpd.Series)
+    assert isinstance(y_test, bpd.Series)
+    # 344 input rows: int(344 * 0.75) = 258 go to train, the remaining 86 to test.
+    assert X_train.shape == (258, 1)
+    assert X_test.shape == (86, 1)
+    assert y_train.shape == (258,)
+    assert y_test.shape == (86,)
+
+
+def test_train_test_split_no_shuffle_correct_rows(
+    session: bigframes.session.Session, penguins_pandas_df_default_index: pd.DataFrame
+):
+    """Check that shuffle=False splits rows by position rather than at random.
+
+    The train outputs must equal the leading rows of each input and the test
+    outputs the trailing rows, index included.
+    """
+    # Note that we're using `penguins_pandas_df_default_index` as this test depends
+    # on a stable row order being present end to end.
+    # Filter down to the chunkiest penguins, to keep our test code a reasonable size.
+    all_data = penguins_pandas_df_default_index[
+        penguins_pandas_df_default_index.body_mass_g > 5500
+    ].sort_index()
+
+    # Note that bigframes loses the index if it doesn't have a name
+    all_data.index.name = "rowindex"
+
+    df = session.read_pandas(all_data)
+
+    # One shared column list keeps the inputs and the expected frames in sync.
+    feature_columns = ["species", "island", "culmen_length_mm"]
+    X = df[feature_columns]
+    y = df["body_mass_g"]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X, y, shuffle=False
+    )
+
+    X_train_pd = cast(bpd.DataFrame, X_train).to_pandas()
+    X_test_pd = cast(bpd.DataFrame, X_test).to_pandas()
+    y_train_pd = cast(bpd.Series, y_train).to_pandas()
+    y_test_pd = cast(bpd.Series, y_test).to_pandas()
+
+    # Recompute the expected split locally. test_size falls back to 0.25 when
+    # neither size is passed, which leaves 0.75 of the rows for train.
+    total_rows = len(all_data)
+    train_size = 0.75
+    train_rows = int(total_rows * train_size)
+    test_rows = total_rows - train_rows
+
+    expected_X_train = all_data.head(train_rows)[feature_columns]
+    expected_y_train = all_data.head(train_rows)["body_mass_g"]
+
+    expected_X_test = all_data.tail(test_rows)[feature_columns]
+    expected_y_test = all_data.tail(test_rows)["body_mass_g"]
+
+    # The pandas assert helpers also compare the index, so row identity is checked.
+    pd.testing.assert_frame_equal(X_train_pd, expected_X_train)
+    pd.testing.assert_frame_equal(X_test_pd, expected_X_test)
+    pd.testing.assert_series_equal(y_train_pd, expected_y_train)
+    pd.testing.assert_series_equal(y_test_pd, expected_y_test)
+
+
 @pytest.mark.parametrize(
     ("train_size", "test_size"),
     [