diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index abb4b0f26c..ca089bb551 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -18,6 +18,7 @@ import inspect +from itertools import chain import time from typing import cast, Generator, List, Optional, Union @@ -36,12 +37,9 @@ def train_test_split( train_size: Union[float, None] = None, random_state: Union[int, None] = None, stratify: Union[bpd.Series, None] = None, + shuffle: bool = True, ) -> List[Union[bpd.DataFrame, bpd.Series]]: - # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same - # number of rows. We probably want to do something similar. Now the implementation is based - # on index. We'll move to based on ordering first. - if test_size is None: if train_size is None: test_size = 0.25 @@ -61,6 +59,26 @@ def train_test_split( f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}" ) + if not shuffle: + if stratify is not None: + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + bf_arrays = list(utils.batch_convert_to_bf_equivalent(*arrays)) + + total_rows = len(bf_arrays[0]) + train_rows = int(total_rows * train_size) + test_rows = total_rows - train_rows + + return list( + chain.from_iterable( + [ + [bf_array.head(train_rows), bf_array.tail(test_rows)] + for bf_array in bf_arrays + ] + ) + ) + dfs = list(utils.batch_convert_to_dataframe(*arrays)) def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]: diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 5c02789576..80630c4f81 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -79,6 +79,30 @@ def batch_convert_to_series( ) +def batch_convert_to_bf_equivalent( + *input: ArrayType, session: Optional[Session] = None +) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]: + """Converts the input to BigFrames 
DataFrame or Series. + + Args: + session: + The session to convert local pandas instances to BigFrames counterparts. + It is not used if the input itself is already a BigFrames DataFrame or Series. + + """ + _validate_sessions(*input, session=session) + + for frame in input: + if isinstance(frame, bpd.DataFrame) or isinstance(frame, pd.DataFrame): + yield convert.to_bf_dataframe(frame, default_index=None, session=session) + elif isinstance(frame, bpd.Series) or isinstance(frame, pd.Series): + yield convert.to_bf_series( + _get_only_column(frame), default_index=None, session=session + ) + else: + raise ValueError(f"Unsupported type: {type(frame)}") + + def _validate_sessions(*input: ArrayType, session: Optional[Session]): session_ids = set( i._session.session_id diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index c1a1e073b9..ebce6e405a 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -13,12 +13,14 @@ # limitations under the License. 
import math +from typing import cast import pandas as pd import pytest from bigframes.ml import model_selection import bigframes.pandas as bpd +import bigframes.session @pytest.mark.parametrize( @@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows( ) +def test_train_test_split_no_shuffle_correct_shape( + penguins_df_default_index: bpd.DataFrame, +): + X = penguins_df_default_index[["species"]] + y = penguins_df_default_index["body_mass_g"] + X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, shuffle=False + ) + assert isinstance(X_train, bpd.DataFrame) + assert isinstance(X_test, bpd.DataFrame) + assert isinstance(y_train, bpd.Series) + assert isinstance(y_test, bpd.Series) + + assert X_train.shape == (258, 1) + assert X_test.shape == (86, 1) + assert y_train.shape == (258,) + assert y_test.shape == (86,) + + +def test_train_test_split_no_shuffle_correct_rows( + session: bigframes.session.Session, penguins_pandas_df_default_index: bpd.DataFrame +): + # Note that we're using `penguins_pandas_df_default_index` as this test depends + # on a stable row order being present end to end + # filter down to the chunkiest penguins, to keep our test code a reasonable size + all_data = penguins_pandas_df_default_index[ + penguins_pandas_df_default_index.body_mass_g > 5500 + ].sort_index() + + # Note that bigframes loses the index if it doesn't have a name + all_data.index.name = "rowindex" + + df = session.read_pandas(all_data) + + X = df[ + [ + "species", + "island", + "culmen_length_mm", + ] + ] + y = df["body_mass_g"] + X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, shuffle=False + ) + + X_train_pd = cast(bpd.DataFrame, X_train).to_pandas() + X_test_pd = cast(bpd.DataFrame, X_test).to_pandas() + y_train_pd = cast(bpd.Series, y_train).to_pandas() + y_test_pd = cast(bpd.Series, y_test).to_pandas() + + total_rows = len(all_data) + train_size = 0.75 + train_rows = int(total_rows * train_size) + test_rows = 
total_rows - train_rows + + expected_X_train = all_data.head(train_rows)[ + ["species", "island", "culmen_length_mm"] + ] + expected_y_train = all_data.head(train_rows)["body_mass_g"] + + expected_X_test = all_data.tail(test_rows)[ + ["species", "island", "culmen_length_mm"] + ] + expected_y_test = all_data.tail(test_rows)["body_mass_g"] + + pd.testing.assert_frame_equal(X_train_pd, expected_X_train) + pd.testing.assert_frame_equal(X_test_pd, expected_X_test) + pd.testing.assert_series_equal(y_train_pd, expected_y_train) + pd.testing.assert_series_equal(y_test_pd, expected_y_test) + + @pytest.mark.parametrize( ("train_size", "test_size"), [