fix: bigframes.ml fit with eval data in partial mode avoids join on null index (#2355)

google-labs-jules[bot] · web-flow · commit 7171d21b8c8d · 2025-12-23T21:42:09.000Z
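Fix a row-ordering issue in bigframes.ml `fit` when evaluation data is supplied in partial ordering mode: X and y are now joined within each split (train, eval) before the two splits are concatenated, so feature and label rows stay aligned. --- *PR created automatically by Jules for task [4750522966926378079](https://jules.google.com/task/4750522966926378079) started by @tswast* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>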
diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py
@@ -201,10 +201,28 @@ def combine_training_and_evaluation_data(
     split_col = guid.generate_guid()
     assert split_col not in X_train.columns
 
+    # To prevent side effects on the input dataframes, we operate on copies
+    X_train = X_train.copy()
+    X_eval = X_eval.copy()
+
     X_train[split_col] = False
     X_eval[split_col] = True
-    X = bpd.concat([X_train, X_eval])
-    y = bpd.concat([y_train, y_eval])
+
+    # Rename y columns to avoid collision with X columns during join
+    y_mapping = {col: guid.generate_guid() + str(col) for col in y_train.columns}
+    y_train_renamed = y_train.rename(columns=y_mapping)
+    y_eval_renamed = y_eval.rename(columns=y_mapping)
+
+    # Join X and y first to preserve row alignment
+    train_combined = X_train.join(y_train_renamed, how="outer")
+    eval_combined = X_eval.join(y_eval_renamed, how="outer")
+
+    combined = bpd.concat([train_combined, eval_combined])
+
+    X = combined[X_train.columns]
+    y = combined[list(y_mapping.values())].rename(
+        columns={v: k for k, v in y_mapping.items()}
+    )
 
     # create options copy to not mutate the incoming one
     bqml_options = bqml_options.copy()
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import pandas as pd
+import pytest
 
 from bigframes.ml import model_selection
 import bigframes.ml.linear_model
@@ -61,12 +62,20 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase
     assert reloaded_model.tol == 0.01
 
 
+@pytest.mark.parametrize(
+    "df_fixture",
+    [
+        "penguins_df_default_index",
+        "penguins_df_null_index",
+    ],
+)
 def test_linear_regression_configure_fit_with_eval_score(
-    penguins_df_default_index, dataset_id
+    df_fixture, dataset_id, request
 ):
+    df = request.getfixturevalue(df_fixture)
     model = bigframes.ml.linear_model.LinearRegression()
 
-    df = penguins_df_default_index.dropna()
+    df = df.dropna()
     X = df[
         [
             "species",
@@ -109,7 +118,7 @@ def test_linear_regression_configure_fit_with_eval_score(
     assert reloaded_model.tol == 0.01
 
     # make sure the bqml model was internally created with custom split
-    bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name)
+    bq_model = df._session.bqclient.get_model(bq_model_name)
     last_fitting = bq_model.training_runs[-1]["trainingOptions"]
     assert last_fitting["dataSplitMethod"] == "CUSTOM"
     assert "dataSplitColumn" in last_fitting