fix: Now resetting indexes for regression datasets when using group folds

Daniel Grindrod · Daniel Grindrod · commit bda63b2bcad9 · 2024-12-06T10:49:20.000Z
diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py
@@ -442,8 +442,8 @@ def prepare_data(
                 X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
             if data_is_df:
                 X_train_all.reset_index(drop=True, inplace=True)
-            if isinstance(y_train_all, pd.Series):
-                y_train_all.reset_index(drop=True, inplace=True)
+        if split_type in ["group", "uniform", "stratified"] and isinstance(y_train_all, pd.Series):
+            y_train_all.reset_index(drop=True, inplace=True)
 
         X_train, y_train = X_train_all, y_train_all
         state.groups_all = state.groups
diff --git a/test/automl/test_split.py b/test/automl/test_split.py
@@ -1,4 +1,5 @@
-from sklearn.datasets import fetch_openml
+import numpy as np
+from sklearn.datasets import fetch_openml, load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import GroupKFold, KFold, train_test_split
 
@@ -48,7 +49,7 @@ def test_time():
     _test(split_type="time")
 
 
-def test_groups():
+def test_groups_for_classification_task():
     from sklearn.externals._arff import ArffException
 
     try:
@@ -88,6 +89,35 @@ def test_groups():
     automl.fit(X, y, **automl_settings)
 
 
+def test_groups_for_regression_task():
+    """Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
+    iris_dict_data = load_iris(as_frame=True)  # sklearn Bunch; as_frame=True requests pandas objects
+    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
+
+    rng = np.random.default_rng(42)
+    iris_data["cluster"] = rng.integers(
+        low=0, high=5, size=iris_data.shape[0]
+    )  # one seeded random integer group label per row, drawn from [0, 5)
+
+    automl = AutoML()
+    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()  # features as ndarray
+    y = iris_data["petal width (cm)"]  # target stays a pandas Series, so it keeps its index through the split
+    X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
+        X, y, iris_data["cluster"], random_state=42
+    )
+    automl_settings = {
+        "max_iter": 5,
+        "time_budget": -1,
+        "metric": "r2",
+        "task": "regression",
+        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
+        "eval_method": "cv",
+        "split_type": "uniform",  # NOTE(review): docstring says GroupKFold -- confirm "group" is not intended
+        "groups": groups_train,
+    }
+    automl.fit(X_train, y_train, **automl_settings)  # no assertions: passes if fit() completes without raising
+
+
 def test_stratified_groupkfold():
     from minio.error import ServerError
     from sklearn.model_selection import StratifiedGroupKFold
@@ -203,4 +233,4 @@ def get_n_splits(self, X=None, y=None, groups=None):
 
 
 if __name__ == "__main__":
-    test_groups()
+    test_groups_for_classification_task()