Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
e7a724a
remove oos utilities from everywhere
voorhs Jan 20, 2025
8c5522e
stage progress
voorhs Jan 21, 2025
a7276c0
DataHandler: add support of splits with oos data
voorhs Jan 21, 2025
d4b4285
fix codestyle
voorhs Jan 21, 2025
efa2d99
fix wrong type annotation
voorhs Jan 21, 2025
f3ee03c
try to add proper multilabel case handling to stratified splitter
voorhs Jan 21, 2025
f73d143
add checking if loaded data already one hot encoded
voorhs Jan 21, 2025
c3b01cc
stage progress on getting rid of handling anything except from ohe la…
voorhs Jan 21, 2025
7b23667
continue
voorhs Jan 21, 2025
7a7f8e3
continue
voorhs Jan 21, 2025
ec0a7e2
update stratifyer
voorhs Jan 21, 2025
6d653a0
minor bug fix
voorhs Jan 22, 2025
f972eb3
fix typing
voorhs Jan 22, 2025
edac994
update test data
voorhs Jan 22, 2025
74d0cdd
update data handler a little bit
voorhs Jan 22, 2025
c9b340f
update test_nli_transformer
voorhs Jan 22, 2025
e91f865
bug fix in test data
voorhs Jan 22, 2025
7bf9e89
add oos, multilabel and inputs validation to decision modules
voorhs Jan 22, 2025
ad16d9f
fix codestyle
voorhs Jan 22, 2025
33ee01d
minor bug fix
voorhs Jan 22, 2025
b8829a2
add oos handling to metrics
voorhs Jan 22, 2025
ce53bd6
bug fix and update callback test
voorhs Jan 22, 2025
bb6db2f
update data_handler test
voorhs Jan 22, 2025
29abf14
update test for stratification
voorhs Jan 22, 2025
729fdd2
update description generation utility and corresponding tests
voorhs Jan 22, 2025
0e3eeee
bug fix in test
voorhs Jan 22, 2025
8d1282f
add test for oos handling in metrics functions
voorhs Jan 22, 2025
317c9bc
fix oos handling in metrics
voorhs Jan 22, 2025
23842e8
forgot to commit it earlier
voorhs Jan 22, 2025
c6628ed
minor refactoring of knn
voorhs Jan 22, 2025
77e9224
fix and update tests for decision modules
voorhs Jan 22, 2025
bfc0993
add validation for supporting multi-class problem
voorhs Jan 22, 2025
0bbc544
update tests for scoring modules
voorhs Jan 22, 2025
ad6da70
update how data_handler reads intent descriptions
voorhs Jan 22, 2025
59fe010
fix adaptive decision and add test on loading and dumping
voorhs Jan 22, 2025
aa75311
fix decision roc_auc and how labels are restored during auto-configur…
voorhs Jan 22, 2025
20acd67
fix some metric
voorhs Jan 22, 2025
722eab0
minor bug fix and update test for inference
voorhs Jan 22, 2025
48d518f
fix codestyle
voorhs Jan 22, 2025
5d584e8
stage progress on type fixing
voorhs Jan 22, 2025
8c092fd
finish fixing typing
voorhs Jan 23, 2025
357d769
pull actual code
voorhs Jan 23, 2025
234c3e0
update test for sklearn
voorhs Jan 23, 2025
4276715
fix user guides
voorhs Jan 23, 2025
b79716a
fix advanced user guide on datasets
voorhs Jan 23, 2025
6d61c5a
move data-related tests to a separate directory
voorhs Jan 23, 2025
94e012f
add oos handling test
voorhs Jan 23, 2025
97c5cd7
fix codestyle
voorhs Jan 23, 2025
2c23415
update doctests for decision modules
voorhs Jan 23, 2025
b71d341
remove clinc script
voorhs Jan 23, 2025
1424900
move exceptions to a separate submodule
voorhs Jan 23, 2025
e90dd08
fix imports
voorhs Jan 23, 2025
72f3f88
fix codestyle
voorhs Jan 23, 2025
662f812
remove unnecessary comments
voorhs Jan 23, 2025
977a4cf
fix tests for threshold and tunable modules
voorhs Jan 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 16 additions & 89 deletions autointent/_dataset/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,23 @@
from pathlib import Path
from typing import Any, TypedDict

from datasets import ClassLabel, Sequence, concatenate_datasets, get_dataset_config_names, load_dataset
from datasets import Dataset as HFDataset
from datasets import Sequence, get_dataset_config_names, load_dataset

from autointent.custom_types import LabelType, Split
from autointent.custom_types import LabelWithOOS, Split
from autointent.schemas import Intent, Tag


class Sample(TypedDict):
"""
Typed dictionary representing a dataset sample.

:param str utterance: The text of the utterance.
:param LabelType | None label: The label associated with the utterance, or None if out-of-scope.
:param utterance: The text of the utterance.
:param label: The label associated with the utterance, or None if out-of-scope.
"""

utterance: str
label: LabelType | None
label: LabelWithOOS


class Dataset(dict[str, HFDataset]):
Expand All @@ -39,7 +39,7 @@ class Dataset(dict[str, HFDataset]):

def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: # noqa: ANN401
"""
Initialize the dataset and configure OOS split if applicable.
Initialize the dataset.

:param args: Positional arguments to initialize the dataset.
:param intents: List of intents associated with the dataset.
Expand All @@ -49,15 +49,6 @@ def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: #

self.intents = intents

self._encoded_labels = False

if self.multilabel:
self._encode_labels()

oos_split = self._create_oos_split()
if oos_split is not None:
self[Split.OOS] = oos_split

@property
def multilabel(self) -> bool:
"""
Expand Down Expand Up @@ -125,7 +116,6 @@ def to_multilabel(self) -> "Dataset":
"""
for split_name, split in self.items():
self[split_name] = split.map(self._to_multilabel)
self._encode_labels()
return self

def to_dict(self) -> dict[str, list[dict[str, Any]]]:
Expand All @@ -144,7 +134,10 @@ def to_json(self, filepath: str | Path) -> None:

:param filepath: The path to the file where the JSON data will be saved.
"""
with Path(filepath).open("w") as file:
path = Path(filepath)
if not path.parent.exists():
path.parent.mkdir(parents=True)
with path.open("w") as file:
json.dump(self.to_dict(), file, indent=4, ensure_ascii=False)

def push_to_hub(self, repo_id: str, private: bool = False) -> None:
Expand Down Expand Up @@ -181,38 +174,15 @@ def get_n_classes(self, split: str) -> int:
"""
classes = set()
for label in self[split][self.label_feature]:
match (label, self._encoded_labels):
case (int(), _):
match label:
case int():
classes.add(label)
case (list(), False):
for label_ in label:
classes.add(label_)
case (list(), True):
case list():
for idx, label_ in enumerate(label):
if label_:
classes.add(idx)
return len(classes)

def _encode_labels(self) -> "Dataset":
"""
Encode dataset labels into one-hot or multilabel format.

:return: Self, with labels encoded.
"""
for split_name, split in self.items():
self[split_name] = split.map(self._encode_label)
self._encoded_labels = True
return self

def _is_oos(self, sample: Sample) -> bool:
"""
Check if a sample is out-of-scope.

:param sample: The sample to check.
:return: True if the sample is out-of-scope, False otherwise.
"""
return sample["label"] is None

def _to_multilabel(self, sample: Sample) -> Sample:
"""
Convert a sample's label to multilabel format.
Expand All @@ -221,50 +191,7 @@ def _to_multilabel(self, sample: Sample) -> Sample:
:return: Sample with label in multilabel format.
"""
if isinstance(sample["label"], int):
sample["label"] = [sample["label"]]
return sample

def _encode_label(self, sample: Sample) -> Sample:
"""
Encode a sample's label as a one-hot vector.

:param sample: The sample to encode.
:return: Sample with encoded label.
"""
one_hot_label = [0] * self.n_classes
match sample["label"]:
case int():
one_hot_label[sample["label"]] = 1
case list():
for idx in sample["label"]:
one_hot_label[idx] = 1
sample["label"] = one_hot_label
ohe_vector = [0] * self.n_classes
ohe_vector[sample["label"]] = 1
sample["label"] = ohe_vector
return sample

def _create_oos_split(self) -> HFDataset | None:
"""
Create an out-of-scope (OOS) split from the dataset.

:return: The OOS split if created, None otherwise.
"""
oos_splits = [split.filter(self._is_oos) for split in self.values()]
oos_splits = [oos_split for oos_split in oos_splits if oos_split.num_rows]
if oos_splits:
for split_name, split in self.items():
self[split_name] = split.filter(lambda sample: not self._is_oos(sample))
return concatenate_datasets(oos_splits)
return None

def _cast_label_feature(self) -> None:
"""Cast the label feature of the dataset to the appropriate type."""
for split_name, split in self.items():
new_features = split.features.copy()
if self.multilabel:
new_features[self.label_feature] = Sequence(
ClassLabel(num_classes=self.n_classes),
)
else:
new_features[self.label_feature] = ClassLabel(
num_classes=self.n_classes,
)
self[split_name] = split.cast(new_features)
30 changes: 18 additions & 12 deletions autointent/_dataset/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,9 @@ def validate_dataset(self) -> "DatasetReader":
]
splits = [split for split in splits if split]

n_classes = [self._get_n_classes(split) for split in splits]
if len(set(n_classes)) != 1:
message = (
f"Mismatch in number of classes across splits. Found class counts: {n_classes}. "
"Ensure all splits have the same number of classes."
)
raise ValueError(message)
if not n_classes[0]:
message = "Number of classes is zero or undefined. " "Ensure at least one class is present in the splits."
raise ValueError(message)
n_classes = self._validate_classes(splits)

self._validate_intents(n_classes[0])
self._validate_intents(n_classes)

for split in splits:
self._validate_split(split)
Expand All @@ -100,6 +91,20 @@ def _get_n_classes(self, split: list[Sample]) -> int:
classes.add(label)
return len(classes)

def _validate_classes(self, splits: list[list[Sample]]) -> int:
    """
    Validate that every split exposes the same non-zero number of classes.

    Note: this checks only the *count* of distinct classes per split, not
    that the splits contain the same class identities.

    :param splits: Non-empty list of dataset splits to check.
    :return: The common number of classes shared by all splits.
    :raises ValueError: If the splits disagree on the class count, or the
        count is zero.
    """
    n_classes = [self._get_n_classes(split) for split in splits]
    if len(set(n_classes)) != 1:
        message = (
            f"Mismatch in number of classes across splits. Found class counts: {n_classes}. "
            "Ensure all splits have the same number of classes."
        )
        raise ValueError(message)
    if not n_classes[0]:
        # Merged the previously split adjacent string literals; runtime text unchanged.
        message = "Number of classes is zero or undefined. Ensure at least one class is present in the splits."
        raise ValueError(message)
    return n_classes[0]

def _validate_intents(self, n_classes: int) -> "DatasetReader":
"""
Validate the intents by checking their IDs for sequential order.
Expand Down Expand Up @@ -132,7 +137,8 @@ def _validate_split(self, split: list[Sample]) -> "DatasetReader":
intent_ids = {intent.id for intent in self.intents}
for sample in split:
message = (
f"Sample with label {sample.label} references a non-existent intent ID. " f"Valid IDs are {intent_ids}."
f"Sample with label {sample.label} and utterance {sample.utterance[:10]}... "
f"references a non-existent intent ID. Valid IDs are {intent_ids}."
)
if isinstance(sample.label, int) and sample.label not in intent_ids:
raise ValueError(message)
Expand Down
19 changes: 12 additions & 7 deletions autointent/_pipeline/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,23 @@
import json
import logging
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any

import numpy as np
import numpy.typing as npt
import yaml

from autointent import Context, Dataset
from autointent.configs import CrossEncoderConfig, EmbedderConfig, InferenceNodeConfig, LoggingConfig, VectorIndexConfig
from autointent.custom_types import NodeType
from autointent.custom_types import ListOfGenericLabels, NodeType
from autointent.metrics import PREDICTION_METRICS_MULTILABEL
from autointent.nodes import InferenceNode, NodeOptimizer
from autointent.utils import load_default_search_space, load_search_space

from ._schemas import InferencePipelineOutput, InferencePipelineUtteranceOutput

if TYPE_CHECKING:
from autointent.modules.abc import DecisionModule, ScoringModule


class Pipeline:
"""Pipeline optimizer class."""
Expand Down Expand Up @@ -185,7 +187,7 @@ def load(cls, path: str | Path) -> "Pipeline":
inference_dict_config = yaml.safe_load(file)
return cls.from_dict_config(inference_dict_config["nodes_configs"])

def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
def predict(self, utterances: list[str]) -> ListOfGenericLabels:
"""
Predict the labels for the utterances.

Expand All @@ -196,8 +198,11 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
msg = "Pipeline in optimization mode cannot perform inference"
raise RuntimeError(msg)

scores = self.nodes[NodeType.scoring].module.predict(utterances) # type: ignore[union-attr]
return self.nodes[NodeType.decision].module.predict(scores) # type: ignore[union-attr]
scoring_module: ScoringModule = self.nodes[NodeType.scoring].module # type: ignore[assignment,union-attr]
decision_module: DecisionModule = self.nodes[NodeType.decision].module # type: ignore[assignment,union-attr]

scores = scoring_module.predict(utterances)
return decision_module.predict(scores)

def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutput:
"""
Expand All @@ -211,7 +216,7 @@ def predict_with_metadata(self, utterances: list[str]) -> InferencePipelineOutpu
raise RuntimeError(msg)

scores, scores_metadata = self.nodes[NodeType.scoring].module.predict_with_metadata(utterances) # type: ignore[union-attr]
predictions = self.nodes[NodeType.decision].module.predict(scores) # type: ignore[union-attr]
predictions = self.nodes[NodeType.decision].module.predict(scores) # type: ignore[union-attr,arg-type]
regexp_predictions, regexp_predictions_metadata = None, None
if NodeType.regexp in self.nodes:
regexp_predictions, regexp_predictions_metadata = self.nodes[NodeType.regexp].module.predict_with_metadata( # type: ignore[union-attr]
Expand Down
10 changes: 5 additions & 5 deletions autointent/_pipeline/_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

from pydantic import BaseModel

from autointent.custom_types import LabelType
from autointent.custom_types import LabelWithOOS, ListOfLabels, ListOfLabelsWithOOS


class InferencePipelineUtteranceOutput(BaseModel):
"""Output of the inference pipeline for a single utterance."""

utterance: str
prediction: LabelType
regexp_prediction: LabelType | None
prediction: LabelWithOOS
regexp_prediction: LabelWithOOS
regexp_prediction_metadata: Any
score: list[float]
score_metadata: Any
Expand All @@ -19,6 +19,6 @@ class InferencePipelineUtteranceOutput(BaseModel):
class InferencePipelineOutput(BaseModel):
"""Output of the inference pipeline."""

predictions: list[LabelType]
regexp_predictions: list[LabelType] | None = None
predictions: ListOfLabelsWithOOS
regexp_predictions: ListOfLabels | None = None
utterances: list[InferencePipelineUtteranceOutput] | None = None
6 changes: 3 additions & 3 deletions autointent/_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from sklearn.linear_model import LogisticRegressionCV
from torch import nn

from autointent.custom_types import LabelType
from autointent.custom_types import ListOfLabels

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -158,7 +158,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
self._activations_list.clear()
return res # type: ignore[no-any-return]

def _fit(self, pairs: list[tuple[str, str]], labels: list[LabelType]) -> None:
def _fit(self, pairs: list[tuple[str, str]], labels: ListOfLabels) -> None:
"""
Train the logistic regression model on cross-encoder features.

Expand All @@ -181,7 +181,7 @@ def _fit(self, pairs: list[tuple[str, str]], labels: list[LabelType]) -> None:

self._clf = clf

def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
def fit(self, utterances: list[str], labels: ListOfLabels) -> None:
"""
Construct training samples and train the logistic regression classifier.

Expand Down
Loading
Loading