Make Class Label Retrieval More Lenient (#1315)

LennartPurucker · eddiebergman · web-flow · commit b22a5e397521 · 2024-01-12T22:28:12.000+01:00
* mark production tests

* make production test run

* fix test bug -1/N

* add retry raise again after refactor

* fix str dict representation

* test: Fix non-writable home mocks

* testing: not not a change

* testing: trigger CI

* typing: Update typing

* ci: Update testing matrix

* testing: Fixup run flow error check

* ci: Manual dispatch, disable double testing

* ci: Prevent further ci duplication

* ci: Add concurrency checks to all

* ci: Remove the max-parallel on test ci

There are a lot fewer now and they cancel previous
pushes in the same PR now, so it shouldn't be a problem anymore

* testing: Fix windows path generation

* add pytest for server state

* add assert cache state

* some formatting

* fix with cache fixture

* finally remove the finally

* doc: Fix link

* update test matrix

* doc: Update to just point to contributing

* add linkcheck ignore for test server

* add special case for class labels that are dtype string

* fix bug and add test

* formatting

---------

Co-authored-by: eddiebergman <eddiebergmanhs@gmail.com>
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -908,8 +908,18 @@ def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]:
         list
         """
         for feature in self.features.values():
-            if (feature.name == target_name) and (feature.data_type == "nominal"):
-                return feature.nominal_values
+            if feature.name == target_name:
+                if feature.data_type == "nominal":
+                    return feature.nominal_values
+
+                if feature.data_type == "string":
+                    # Rel.: #1311
+                    # The target is invalid for a classification task if the feature type is string
+                    # and not nominal. For such misconfigured tasks, we silently fix it here as
+                    # we can safely interpret string as nominal.
+                    df, *_ = self.get_data()
+                    return list(df[feature.name].unique())
+
         return None
 
     def get_features_by_type(  # noqa: C901
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -626,11 +626,18 @@ def test__retrieve_class_labels(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels()
         assert labels == ["1", "2", "3", "4", "5", "U"]
+
         labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels(
             target_name="product-type",
         )
         assert labels == ["C", "H", "G"]
 
+        # Test workaround for string-typed class labels
+        custom_ds = openml.datasets.get_dataset(2, download_data=False)
+        custom_ds.features[31].data_type = "string"
+        labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
+        assert labels == ["COIL", "SHEET"]
+
     def test_upload_dataset_with_url(self):
         dataset = OpenMLDataset(
             "%s-UploadTestWithURL" % self._get_sentinel(),