deeppavlov
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 0 additions & 35 deletions b/‎CONTRIBUTING.md‎
Lines changed: 0 additions & 35 deletions
diff --git a/‎Makefile‎
Lines changed: 1 addition & 1 deletion b/‎Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎autointent/__init__.py‎
Lines changed: 16 additions & 3 deletions b/‎autointent/__init__.py‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎autointent/_callbacks/base.py‎
Lines changed: 8 additions & 0 deletions b/‎autointent/_callbacks/base.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎autointent/_callbacks/callback_handler.py‎
Lines changed: 8 additions & 0 deletions b/‎autointent/_callbacks/callback_handler.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎autointent/_callbacks/tensorboard.py‎
Lines changed: 16 additions & 0 deletions b/‎autointent/_callbacks/tensorboard.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎autointent/_callbacks/wandb.py‎
Lines changed: 8 additions & 0 deletions b/‎autointent/_callbacks/wandb.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎autointent/_dataset/_dataset.py‎
Lines changed: 12 additions & 88 deletions b/‎autointent/_dataset/_dataset.py‎
Lines changed: 12 additions & 88 deletions
diff --git a/‎autointent/_dataset/_validation.py‎
Lines changed: 18 additions & 12 deletions b/‎autointent/_dataset/_validation.py‎
Lines changed: 18 additions & 12 deletions
@@ -50,41 +50,6 @@ make lint
 
 ![](assets/dependency-graph.png)
 
-## Настройка логгера
-Чтобы видеть debug строчки у вас есть несколько опций:
-
-1. Включить весь debug output через опцию коммандной строки: 
-```bash 
-autointent hydra.verbose=true
-```
-2. Включить debug output только для определенных модулей, пример для autointent.pipeline.optimization.cli_endpoint и самой hydra: 
-```bash
-autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG
-```
-
-Само конфигурирование логгера сделано в autointent.configs.optimization_cli.logger_config. Вы можете изменить любой параметр логгера через коммандную строку. Вот пример, как поменять уровень логгера на ERROR:
-```bash
-autointent hydra.job_logging.root.level=ERROR
-```
-
-Еще можно изменить параметры логгера через yaml файлы:
-1. Создадим папку с конфиг. файлами: test_config
-2. test_config/config.yaml:
-```yaml
-defaults:
-  - optimization_config
-  - _self_
-  - override hydra/job_logging: custom
-  
-# set your config params for optimization here
-embedder_batch_size: 32
-```
-3. Поместите конфигурацию логгера в test_config/hydra/job_logging/custom.yaml (параметры см. [здесь](https://docs.python.org/3/howto/logging.html))
-4. Запускаем с конфиг файлом config.yaml:
-```bash
-autointent --config-path FULL_PATH/test_config --config-name config
-```
-
 ## Построение документации
 
 Построить html версию в папке `docs/build`:
 
@@ -24,7 +24,7 @@ lint:
 
 .PHONY: sync
 sync:
-	poetry sync
+	poetry sync --with dev,test,typing,docs
 
 .PHONY: docs
 docs:
 
@@ -1,9 +1,22 @@
 """This is AutoIntent API reference."""
 
-from ._embedder import Embedder
 from ._dataset import Dataset
+from ._embedder import Embedder
 from ._hash import Hasher
-from .context import Context, load_dataset
+from ._logging import setup_logging
 from ._pipeline import Pipeline
+from ._ranker import Ranker
+from ._vector_index import VectorIndex
+from .context import Context, load_dataset
 
-__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "load_dataset"]
+__all__ = [
+    "Context",
+    "Dataset",
+    "Embedder",
+    "Hasher",
+    "Pipeline",
+    "Ranker",
+    "VectorIndex",
+    "load_dataset",
+    "setup_logging",
+]
@@ -42,6 +42,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         :param kwargs: Data to log.
         """
 
+    @abstractmethod
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+
     @abstractmethod
     def end_module(self) -> None:
         """End a module."""
 
@@ -44,6 +44,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.call_events("log_value", **kwargs)
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.call_events("log_metrics", metrics=metrics)
+
     def end_module(self) -> None:
         """End a module."""
         self.call_events("end_module")
 
@@ -73,6 +73,22 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
             else:
                 self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training; raise RuntimeError if no module writer is active.
+
+        :param metrics: Metrics to log; int/float values go to scalars, the rest to text.
+        """
+        if self.module_writer is None:
+            msg = "start_run must be called before log_metrics."
+            raise RuntimeError(msg)
+
+        for key, value in metrics.items():
+            if isinstance(value, int | float):
+                self.module_writer.add_scalar(key, value)  # type: ignore[no-untyped-call]
+            else:
+                self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.
 
@@ -59,6 +59,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.wandb.log(kwargs)
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.wandb.log(metrics)
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.
 
@@ -6,23 +6,23 @@
 from pathlib import Path
 from typing import Any, TypedDict
 
-from datasets import ClassLabel, Sequence, concatenate_datasets, get_dataset_config_names, load_dataset
 from datasets import Dataset as HFDataset
+from datasets import Sequence, get_dataset_config_names, load_dataset
 
-from autointent.custom_types import LabelType, Split
+from autointent.custom_types import LabelWithOOS, Split
 from autointent.schemas import Intent, Tag
 
 
 class Sample(TypedDict):
     """
     Typed dictionary representing a dataset sample.
 
-    :param str utterance: The text of the utterance.
-    :param LabelType | None label: The label associated with the utterance, or None if out-of-scope.
+    :param utterance: The text of the utterance.
+    :param label: The label associated with the utterance, or None if out-of-scope.
     """
 
     utterance: str
-    label: LabelType | None
+    label: LabelWithOOS
 
 
 class Dataset(dict[str, HFDataset]):
@@ -39,7 +39,7 @@ class Dataset(dict[str, HFDataset]):
 
     def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None:  # noqa: ANN401
         """
-        Initialize the dataset and configure OOS split if applicable.
+        Initialize the dataset.
 
         :param args: Positional arguments to initialize the dataset.
         :param intents: List of intents associated with the dataset.
@@ -49,15 +49,6 @@ def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None:  #
 
         self.intents = intents
 
-        self._encoded_labels = False
-
-        if self.multilabel:
-            self._encode_labels()
-
-        oos_split = self._create_oos_split()
-        if oos_split is not None:
-            self[Split.OOS] = oos_split
-
     @property
     def multilabel(self) -> bool:
         """
@@ -125,7 +116,6 @@ def to_multilabel(self) -> "Dataset":
         """
         for split_name, split in self.items():
             self[split_name] = split.map(self._to_multilabel)
-        self._encode_labels()
         return self
 
     def to_dict(self) -> dict[str, list[dict[str, Any]]]:
@@ -184,38 +174,15 @@ def get_n_classes(self, split: str) -> int:
         """
         classes = set()
         for label in self[split][self.label_feature]:
-            match (label, self._encoded_labels):
-                case (int(), _):
+            match label:
+                case int():
                     classes.add(label)
-                case (list(), False):
-                    for label_ in label:
-                        classes.add(label_)
-                case (list(), True):
+                case list():
                     for idx, label_ in enumerate(label):
                         if label_:
                             classes.add(idx)
         return len(classes)
 
-    def _encode_labels(self) -> "Dataset":
-        """
-        Encode dataset labels into one-hot or multilabel format.
-
-        :return: Self, with labels encoded.
-        """
-        for split_name, split in self.items():
-            self[split_name] = split.map(self._encode_label)
-        self._encoded_labels = True
-        return self
-
-    def _is_oos(self, sample: Sample) -> bool:
-        """
-        Check if a sample is out-of-scope.
-
-        :param sample: The sample to check.
-        :return: True if the sample is out-of-scope, False otherwise.
-        """
-        return sample["label"] is None
-
     def _to_multilabel(self, sample: Sample) -> Sample:
         """
         Convert a sample's label to multilabel format.
@@ -224,50 +191,7 @@ def _to_multilabel(self, sample: Sample) -> Sample:
         :return: Sample with label in multilabel format.
         """
         if isinstance(sample["label"], int):
-            sample["label"] = [sample["label"]]
-        return sample
-
-    def _encode_label(self, sample: Sample) -> Sample:
-        """
-        Encode a sample's label as a one-hot vector.
-
-        :param sample: The sample to encode.
-        :return: Sample with encoded label.
-        """
-        one_hot_label = [0] * self.n_classes
-        match sample["label"]:
-            case int():
-                one_hot_label[sample["label"]] = 1
-            case list():
-                for idx in sample["label"]:
-                    one_hot_label[idx] = 1
-        sample["label"] = one_hot_label
+            ohe_vector = [0] * self.n_classes
+            ohe_vector[sample["label"]] = 1
+            sample["label"] = ohe_vector
         return sample
-
-    def _create_oos_split(self) -> HFDataset | None:
-        """
-        Create an out-of-scope (OOS) split from the dataset.
-
-        :return: The OOS split if created, None otherwise.
-        """
-        oos_splits = [split.filter(self._is_oos) for split in self.values()]
-        oos_splits = [oos_split for oos_split in oos_splits if oos_split.num_rows]
-        if oos_splits:
-            for split_name, split in self.items():
-                self[split_name] = split.filter(lambda sample: not self._is_oos(sample))
-            return concatenate_datasets(oos_splits)
-        return None
-
-    def _cast_label_feature(self) -> None:
-        """Cast the label feature of the dataset to the appropriate type."""
-        for split_name, split in self.items():
-            new_features = split.features.copy()
-            if self.multilabel:
-                new_features[self.label_feature] = Sequence(
-                    ClassLabel(num_classes=self.n_classes),
-                )
-            else:
-                new_features[self.label_feature] = ClassLabel(
-                    num_classes=self.n_classes,
-                )
-            self[split_name] = split.cast(new_features)
@@ -66,18 +66,9 @@ def validate_dataset(self) -> "DatasetReader":
         ]
         splits = [split for split in splits if split]
 
-        n_classes = [self._get_n_classes(split) for split in splits]
-        if len(set(n_classes)) != 1:
-            message = (
-                f"Mismatch in number of classes across splits. Found class counts: {n_classes}. "
-                "Ensure all splits have the same number of classes."
-            )
-            raise ValueError(message)
-        if not n_classes[0]:
-            message = "Number of classes is zero or undefined. " "Ensure at least one class is present in the splits."
-            raise ValueError(message)
+        n_classes = self._validate_classes(splits)
 
-        self._validate_intents(n_classes[0])
+        self._validate_intents(n_classes)
 
         for split in splits:
             self._validate_split(split)
@@ -100,6 +91,20 @@ def _get_n_classes(self, split: list[Sample]) -> int:
                         classes.add(label)
         return len(classes)
 
+    def _validate_classes(self, splits: list[list[Sample]]) -> int:
+        """Return the class count shared by all splits; raise ValueError on mismatch or zero."""
+        n_classes = [self._get_n_classes(split) for split in splits]
+        if len(set(n_classes)) > 1:
+            message = (
+                f"Mismatch in number of classes across splits. Found class counts: {n_classes}. "
+                "Ensure all splits have the same number of classes."
+            )
+            raise ValueError(message)
+        if not n_classes or not n_classes[0]:
+            message = "Number of classes is zero or undefined. Ensure at least one class is present in the splits."
+            raise ValueError(message)
+        return n_classes[0]
+
     def _validate_intents(self, n_classes: int) -> "DatasetReader":
         """
         Validate the intents by checking their IDs for sequential order.
@@ -132,7 +137,8 @@ def _validate_split(self, split: list[Sample]) -> "DatasetReader":
         intent_ids = {intent.id for intent in self.intents}
         for sample in split:
             message = (
-                f"Sample with label {sample.label} references a non-existent intent ID. " f"Valid IDs are {intent_ids}."
+                f"Sample with label {sample.label} and utterance {sample.utterance[:10]}... "
+                f"references a non-existent intent ID. Valid IDs are {intent_ids}."
             )
             if isinstance(sample.label, int) and sample.label not in intent_ids:
                 raise ValueError(message)