feat: Allow DataFrame.join for self-join on Null index (#860)

TrevorBergeron · web-flow · commit e95053372c36 · 2024-07-30T12:33:14.000-05:00
* feat: Allow DataFrame.join for self-join on Null index

* fix ML caching so it is applied after the join instead of before it; add test

* fix ml golden sql test

* change unordered test to use linear regression
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -2307,11 +2307,11 @@ def join(
                 f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}"
             )
         # Handle null index, which only supports row join
-        if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join:
-            if not block_identity_join:
-                result = try_row_join(self, other, how=how)
-                if result is not None:
-                    return result
+        # This is the canonical way of aligning on null index, so always allow (ignore block_identity_join)
+        if self.index.nlevels == other.index.nlevels == 0:
+            result = try_row_join(self, other, how=how)
+            if result is not None:
+                return result
             raise bigframes.exceptions.NullIndexError(
                 "Cannot implicitly align objects. Set an explicit index using set_index."
             )
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
@@ -83,7 +83,7 @@ def distance(
         """
         assert len(x.columns) == 1 and len(y.columns) == 1
 
-        input_data = x.cache().join(y.cache(), how="outer")
+        input_data = x.join(y, how="outer").cache()
         x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0]
 
         return self._apply_sql(
@@ -326,7 +326,7 @@ def create_model(
         if y_train is None:
             input_data = X_train.cache()
         else:
-            input_data = X_train.cache().join(y_train.cache(), how="outer")
+            input_data = X_train.join(y_train, how="outer").cache()
             options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()})
 
         session = X_train._session
@@ -366,7 +366,7 @@ def create_llm_remote_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
-        input_data = X_train.cache().join(y_train.cache(), how="outer")
+        input_data = X_train.join(y_train, how="outer").cache()
         options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()})
 
         session = X_train._session
@@ -399,7 +399,7 @@ def create_time_series_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
-        input_data = X_train.cache().join(y_train.cache(), how="outer")
+        input_data = X_train.join(y_train, how="outer").cache()
         options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]})
         options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]})
 
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
@@ -111,6 +111,50 @@ def test_linear_regression_customized_params_fit_score(
     assert reloaded_model.learning_rate == 0.2
 
 
+def test_unordered_mode_regression_configure_fit_score(
+    unordered_session, penguins_table_id, dataset_id
+):
+    model = bigframes.ml.linear_model.LinearRegression()
+
+    df = unordered_session.read_gbq(penguins_table_id).dropna()
+    X_train = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+            "sex",
+        ]
+    ]
+    y_train = df[["body_mass_g"]]
+    model.fit(X_train, y_train)
+
+    # Check score to ensure the model was fitted
+    result = model.score(X_train, y_train).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        result, columns=utils.ML_REGRESSION_METRICS, index=1
+    )
+
+    # save, load, check parameters to ensure configuration was kept
+    reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True)
+    assert reloaded_model._bqml_model is not None
+    assert (
+        f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name
+    )
+    assert reloaded_model.optimize_strategy == "NORMAL_EQUATION"
+    assert reloaded_model.fit_intercept is True
+    assert reloaded_model.calculate_p_values is False
+    assert reloaded_model.enable_global_explain is False
+    assert reloaded_model.l1_reg is None
+    assert reloaded_model.l2_reg == 0.0
+    assert reloaded_model.learning_rate is None
+    assert reloaded_model.learning_rate_strategy == "line_search"
+    assert reloaded_model.ls_init_learning_rate is None
+    assert reloaded_model.max_iterations == 20
+    assert reloaded_model.tol == 0.01
+
+
 # TODO(garrettwu): add tests for param warm_start. Requires a trained model.
 
 
diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py
@@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index
     )
 
 
+def test_null_index_series_self_join(
+    scalars_df_null_index, scalars_pandas_df_default_index
+):
+    bf_result = scalars_df_null_index[["int64_col"]].join(
+        scalars_df_null_index[["int64_too"]]
+    )
+    pd_result = scalars_pandas_df_default_index[["int64_col"]].join(
+        scalars_pandas_df_default_index[["int64_too"]]
+    )
+    pd.testing.assert_frame_equal(
+        bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False
+    )
+
+
 def test_null_index_series_self_aligns(
     scalars_df_null_index, scalars_pandas_df_default_index
 ):
diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py
@@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session):
         ["index_column_label"],
     )
     mock_X.join(mock_y).sql = "input_X_y_sql"
+    mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y)
     mock_X.join(mock_y)._to_sql_query.return_value = (
         "input_X_y_sql",
         ["index_column_id"],

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session):` |
| `78` | `78` | `["index_column_label"],` |
| `79` | `79` | `)` |
| `80` | `80` | `mock_X.join(mock_y).sql = "input_X_y_sql"` |
| | `81` | `+ mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y)` |
| `81` | `82` | `mock_X.join(mock_y)._to_sql_query.return_value = (` |
| `82` | `83` | `"input_X_y_sql",` |
| `83` | `84` | `["index_column_id"],` |