Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions bigframes/ml/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@


import inspect
from itertools import chain
import time
from typing import cast, Generator, List, Optional, Union

Expand All @@ -36,12 +37,9 @@ def train_test_split(
train_size: Union[float, None] = None,
random_state: Union[int, None] = None,
stratify: Union[bpd.Series, None] = None,
shuffle: bool = True,
) -> List[Union[bpd.DataFrame, bpd.Series]]:

# TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same
# number of rows. We probably want to do something similar. Now the implementation is based
# on index. We'll move to based on ordering first.

if test_size is None:
if train_size is None:
test_size = 0.25
Expand All @@ -61,6 +59,26 @@ def train_test_split(
f"The sum of train_size and test_size exceeds 1.0. train_size: {train_size}. test_size: {test_size}"
)

if not shuffle:
if stratify is not None:
raise ValueError(
"Stratified train/test split is not implemented for shuffle=False"
)
bf_arrays = list(utils.batch_convert_to_bf_array_type(*arrays))

total_rows = len(bf_arrays[0])
train_rows = int(total_rows * train_size)
test_rows = total_rows - train_rows

return list(
chain.from_iterable(
[
[bf_array.head(train_rows), bf_array.tail(test_rows)]
for bf_array in bf_arrays
]
)
)

dfs = list(utils.batch_convert_to_dataframe(*arrays))

def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]:
Expand Down
24 changes: 24 additions & 0 deletions bigframes/ml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,30 @@ def batch_convert_to_series(
)


def batch_convert_to_bf_array_type(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

naming nit: "batch_convert_to_bf_equivalent"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

*input: ArrayType, session: Optional[Session] = None
) -> Generator[Union[bpd.DataFrame, bpd.Series], None, None]:
"""Converts the input to BigFrames DataFrame or Series.

Args:
session:
The session used to convert local pandas objects to their BigFrames counterparts.
It is not used if the input is already a BigFrames DataFrame or Series.

"""
_validate_sessions(*input, session=session)

for frame in input:
if isinstance(frame, bpd.DataFrame) or isinstance(frame, pd.DataFrame):
yield convert.to_bf_dataframe(frame, default_index=None, session=session)
elif isinstance(frame, bpd.Series) or isinstance(frame, pd.Series):
yield convert.to_bf_series(
_get_only_column(frame), default_index=None, session=session
)
else:
raise ValueError(f"Unsupported type: {type(frame)}")


def _validate_sessions(*input: ArrayType, session: Optional[Session]):
session_ids = set(
i._session.session_id
Expand Down
74 changes: 74 additions & 0 deletions tests/system/small/ml/test_model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
# limitations under the License.

import math
from typing import cast

import pandas as pd
import pytest

from bigframes.ml import model_selection
import bigframes.pandas as bpd
import bigframes.session


@pytest.mark.parametrize(
Expand Down Expand Up @@ -219,6 +221,78 @@ def test_train_test_split_seeded_correct_rows(
)


def test_train_test_split_no_shuffle_correct_shape(
    penguins_df_default_index: bpd.DataFrame,
):
    """Check result types and shapes of an unshuffled split with default sizes.

    A one-column DataFrame and a Series are split with ``shuffle=False``; the
    outputs must keep their BigFrames container types and have the expected
    row counts.
    """
    features = penguins_df_default_index[["species"]]
    target = penguins_df_default_index["body_mass_g"]

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, target, shuffle=False
    )

    # DataFrame inputs must come back as DataFrames, Series inputs as Series.
    for frame_part in (X_train, X_test):
        assert isinstance(frame_part, bpd.DataFrame)
    for series_part in (y_train, y_test):
        assert isinstance(series_part, bpd.Series)

    # The expected sizes total 344 rows, split 258 / 86 (75% / 25%).
    assert X_train.shape == (258, 1)
    assert X_test.shape == (86, 1)
    assert y_train.shape == (258,)
    assert y_test.shape == (86,)


def test_train_test_split_no_shuffle_correct_rows(
    session: bigframes.session.Session, penguins_pandas_df_default_index: pd.DataFrame
):
    """Check that an unshuffled split returns the leading/trailing rows in order.

    The expected train and test parts are built locally with pandas ``head`` /
    ``tail`` and compared against the BigFrames results after downloading them.

    Args:
        session: BigFrames session used to upload the local pandas data.
        penguins_pandas_df_default_index: Local pandas copy of the penguins
            data. It is consumed with pandas-only APIs here (``sort_index``,
            ``session.read_pandas``), hence the pandas annotation.
    """
    feature_columns = ["species", "island", "culmen_length_mm"]

    # Note that we're using `penguins_pandas_df_default_index` as this test depends
    # on a stable row order being present end to end.
    # Filter down to the chunkiest penguins, to keep our test data a reasonable size.
    all_data = penguins_pandas_df_default_index[
        penguins_pandas_df_default_index.body_mass_g > 5500
    ].sort_index()

    # Note that bigframes loses the index if it doesn't have a name.
    all_data.index.name = "rowindex"

    df = session.read_pandas(all_data)

    X = df[feature_columns]
    y = df["body_mass_g"]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, shuffle=False
    )

    # Download every part so it can be compared with the pandas expectations.
    X_train_pd = cast(bpd.DataFrame, X_train).to_pandas()
    X_test_pd = cast(bpd.DataFrame, X_test).to_pandas()
    y_train_pd = cast(bpd.Series, y_train).to_pandas()
    y_test_pd = cast(bpd.Series, y_test).to_pandas()

    # Build the expected row counts locally; 0.75 is the train fraction this
    # test expects when no sizes are passed to train_test_split.
    total_rows = len(all_data)
    train_size = 0.75
    train_rows = int(total_rows * train_size)
    test_rows = total_rows - train_rows

    expected_train = all_data.head(train_rows)
    expected_test = all_data.tail(test_rows)

    pd.testing.assert_frame_equal(X_train_pd, expected_train[feature_columns])
    pd.testing.assert_frame_equal(X_test_pd, expected_test[feature_columns])
    pd.testing.assert_series_equal(y_train_pd, expected_train["body_mass_g"])
    pd.testing.assert_series_equal(y_test_pd, expected_test["body_mass_g"])


@pytest.mark.parametrize(
("train_size", "test_size"),
[
Expand Down