fix: KeyError no longer occurs when using group folds for regression tasks. (#1385)

dannycg1996 · Daniel Grindrod · thinkall · web-flow · commit c038fbca0747 · 2024-12-18T10:06:58.000+08:00
* fix: Now resetting indexes for regression datasets when using group folds

* refactor: Simplified if statement to include all fold types

* docs: Updated docs to make it clear that group folds can be used for regression tasks

---------

Co-authored-by: Daniel Grindrod <daniel.grindrod@evotec.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py
@@ -203,7 +203,7 @@ def custom_metric(
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                     "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
@@ -739,7 +739,7 @@ def retrain_from_log(
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                     "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
@@ -1358,7 +1358,7 @@ def custom_metric(
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                     "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
diff --git a/flaml/automl/task/generic_task.py b/flaml/automl/task/generic_task.py
@@ -442,8 +442,8 @@ def prepare_data(
                 X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
             if data_is_df:
                 X_train_all.reset_index(drop=True, inplace=True)
-            if isinstance(y_train_all, pd.Series):
-                y_train_all.reset_index(drop=True, inplace=True)
+        if isinstance(y_train_all, pd.Series):
+            y_train_all.reset_index(drop=True, inplace=True)
 
         X_train, y_train = X_train_all, y_train_all
         state.groups_all = state.groups
diff --git a/flaml/automl/task/task.py b/flaml/automl/task/task.py
@@ -192,7 +192,7 @@ def prepare_data(
                 * Valid str options depend on different tasks.
                 For classification tasks, valid choices are
                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
-                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
+                For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
                     "auto" -> uniform.
                 For time series forecast tasks, must be "auto" or 'time'.
                 For ranking task, must be "auto" or 'group'.
diff --git a/test/automl/test_split.py b/test/automl/test_split.py
@@ -1,4 +1,5 @@
-from sklearn.datasets import fetch_openml
+import numpy as np
+from sklearn.datasets import fetch_openml, load_iris
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import GroupKFold, KFold, train_test_split
 
@@ -48,7 +49,7 @@ def test_time():
     _test(split_type="time")
 
 
-def test_groups():
+def test_groups_for_classification_task():
     from sklearn.externals._arff import ArffException
 
     try:
@@ -88,6 +89,35 @@ def test_groups():
     automl.fit(X, y, **automl_settings)
 
 
+def test_groups_for_regression_task():
+    """Attach arbitrary group labels to iris and check that fit() completes with split_type="group" for regression."""
+    iris_dict_data = load_iris(as_frame=True)  # Bunch holding pandas objects (as_frame=True)
+    iris_data = iris_dict_data["frame"]  # pandas dataframe data + target
+
+    rng = np.random.default_rng(42)
+    iris_data["cluster"] = rng.integers(
+        low=0, high=5, size=iris_data.shape[0]
+    )  # seeded, arbitrary group label in [0, 5) for every row
+
+    automl = AutoML()
+    X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
+    y = iris_data["petal width (cm)"]
+    X_train, _, y_train, _, groups_train, _ = train_test_split(
+        X, y, iris_data["cluster"], random_state=42
+    )  # y_train stays a pd.Series whose index is shuffled and non-contiguous after the split
+    automl_settings = {
+        "max_iter": 5,
+        "time_budget": -1,
+        "metric": "r2",
+        "task": "regression",
+        "estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
+        "eval_method": "cv",
+        "split_type": "group",  # the group-fold path is what this regression test is meant to cover
+        "groups": groups_train,
+    }
+    automl.fit(X_train, y_train, **automl_settings)
+
+
 def test_stratified_groupkfold():
     from minio.error import ServerError
     from sklearn.model_selection import StratifiedGroupKFold
@@ -204,4 +234,4 @@ def get_n_splits(self, X=None, y=None, groups=None):
 
 
 if __name__ == "__main__":
-    test_groups()
+    test_groups_for_classification_task()