Employ a better multi-label stratifier (iterative-stratification) than skmultilearn's

voorhs · voorhs · commit 7c5c65f65117 · 2025-10-23T20:15:01.000+03:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "sentence-transformers (>=3,<4)",
     "scikit-learn (>=1.5,<2.0)",
-    "scikit-multilearn (==0.2.0)",
+    "iterative-stratification (>=0.1.9)",
     "appdirs (>=1.4,<2.0)",
     "optuna (>=4.0.0,<5.0.0)",
     "pathlib (>=1.0.1,<2.0.0)",
@@ -253,7 +253,7 @@ module = [
     "xeger",
     "appdirs",
     "sre_yield",
-    "skmultilearn.model_selection",
+    "iterstrat.ml_stratifiers",
     "hydra",
     "hydra.*",
     "transformers",
diff --git a/src/autointent/context/data_handler/_stratification.py b/src/autointent/context/data_handler/_stratification.py
@@ -5,15 +5,14 @@
 """
 
 import logging
-import random
 from collections.abc import Sequence
 
 import numpy as np
 from datasets import Dataset as HFDataset
 from datasets import concatenate_datasets
+from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
 from numpy import typing as npt
 from sklearn.model_selection import train_test_split
-from skmultilearn.model_selection import IterativeStratification
 
 from autointent import Dataset
 from autointent.custom_types import LabelType
@@ -155,13 +154,10 @@ def _split_multilabel(self, dataset: HFDataset, test_size: float) -> Sequence[np
         Returns:
             A sequence containing indices for train and test splits.
         """
-        if self.random_seed is not None:
-            # Set all seeds for reproducibility (workaround for bugs in IterativeStratification from skmultilearn)
-            random.seed(self.random_seed)
-        splitter = IterativeStratification(
-            n_splits=2,
-            order=2,
-            sample_distribution_per_fold=[test_size, 1.0 - test_size],
+        splitter = MultilabelStratifiedShuffleSplit(
+            n_splits=1,
+            test_size=test_size,
+            random_state=self.random_seed,
         )
         return next(splitter.split(np.arange(len(dataset)), np.array(dataset[self.label_feature])))
 
diff --git a/tests/data/test_data_handler.py b/tests/data/test_data_handler.py
@@ -89,13 +89,13 @@ def test_data_handler_multilabel_mode(sample_multilabel_data):
     assert handler.multilabel is True
     assert handler.dataset.n_classes == 2
     assert handler.train_utterances(0) == [
-        "hey, how's it going?",
+        "farewell and see you later",
+        "good morning",
         "so long and take care",
-        "hello, nice to meet you",
-        "later, see you soon",
+        "greetings and salutations",
     ]
     assert handler.test_utterances() == ["greetings", "farewell"]
-    assert handler.train_labels(0) == [[1, 0], [0, 1], [0, 1], [1, 0]]
+    assert handler.train_labels(0) == [[0, 1], [1, 0], [0, 1], [1, 0]]
     assert handler.test_labels() == [[0, 1], [1, 0]]
 
 
@@ -239,6 +239,6 @@ def test_few_shot_split(dataset):
     }
 
     for data_split in dh.dataset:
-        assert (
-            Counter(dh.dataset[data_split][dh.dataset.label_feature]) == desired_specs[data_split]
-        ), f"Failed for {data_split}"
+        assert Counter(dh.dataset[data_split][dh.dataset.label_feature]) == desired_specs[data_split], (
+            f"Failed for {data_split}"
+        )
diff --git a/tests/data/test_stratificaiton.py b/tests/data/test_stratificaiton.py
@@ -38,8 +38,8 @@ def test_multilabel_train_test_split(dataset_unsplitted):
 
     assert Split.TRAIN in dataset
     assert Split.TEST in dataset
-    assert dataset[Split.TRAIN].num_rows == 18
-    assert dataset[Split.TEST].num_rows == 18
+    assert dataset[Split.TRAIN].num_rows == 19
+    assert dataset[Split.TEST].num_rows == 17
     assert dataset.get_n_classes(Split.TRAIN) == dataset.get_n_classes(Split.TEST)