Skip to content

Commit 7171d21

Browse files
fix: bigframes.ml fit with eval data in partial mode avoids join on null index (#2355)
Fix ML fit ordering issue with partial mode and eval data.

---

*PR created automatically by Jules for task [4750522966926378079](https://jules.google.com/task/4750522966926378079) started by @tswast*

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent b8f0901 commit 7171d21

File tree

2 files changed

+32
-5
lines changed

2 files changed

+32
-5
lines changed

bigframes/ml/utils.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,28 @@ def combine_training_and_evaluation_data(
201201
split_col = guid.generate_guid()
202202
assert split_col not in X_train.columns
203203

204+
# To prevent side effects on the input dataframes, we operate on copies
205+
X_train = X_train.copy()
206+
X_eval = X_eval.copy()
207+
204208
X_train[split_col] = False
205209
X_eval[split_col] = True
206-
X = bpd.concat([X_train, X_eval])
207-
y = bpd.concat([y_train, y_eval])
210+
211+
# Rename y columns to avoid collision with X columns during join
212+
y_mapping = {col: guid.generate_guid() + str(col) for col in y_train.columns}
213+
y_train_renamed = y_train.rename(columns=y_mapping)
214+
y_eval_renamed = y_eval.rename(columns=y_mapping)
215+
216+
# Join X and y first to preserve row alignment
217+
train_combined = X_train.join(y_train_renamed, how="outer")
218+
eval_combined = X_eval.join(y_eval_renamed, how="outer")
219+
220+
combined = bpd.concat([train_combined, eval_combined])
221+
222+
X = combined[X_train.columns]
223+
y = combined[list(y_mapping.values())].rename(
224+
columns={v: k for k, v in y_mapping.items()}
225+
)
208226

209227
# create options copy to not mutate the incoming one
210228
bqml_options = bqml_options.copy()

tests/system/large/ml/test_linear_model.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import pandas as pd
16+
import pytest
1617

1718
from bigframes.ml import model_selection
1819
import bigframes.ml.linear_model
@@ -61,12 +62,20 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase
6162
assert reloaded_model.tol == 0.01
6263

6364

65+
@pytest.mark.parametrize(
66+
"df_fixture",
67+
[
68+
"penguins_df_default_index",
69+
"penguins_df_null_index",
70+
],
71+
)
6472
def test_linear_regression_configure_fit_with_eval_score(
65-
penguins_df_default_index, dataset_id
73+
df_fixture, dataset_id, request
6674
):
75+
df = request.getfixturevalue(df_fixture)
6776
model = bigframes.ml.linear_model.LinearRegression()
6877

69-
df = penguins_df_default_index.dropna()
78+
df = df.dropna()
7079
X = df[
7180
[
7281
"species",
@@ -109,7 +118,7 @@ def test_linear_regression_configure_fit_with_eval_score(
109118
assert reloaded_model.tol == 0.01
110119

111120
# make sure the bqml model was internally created with custom split
112-
bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name)
121+
bq_model = df._session.bqclient.get_model(bq_model_name)
113122
last_fitting = bq_model.training_runs[-1]["trainingOptions"]
114123
assert last_fitting["dataSplitMethod"] == "CUSTOM"
115124
assert "dataSplitColumn" in last_fitting

0 commit comments

Comments
 (0)