deeppavlov
diff --git a/.github/workflows/build-docs.yaml b/.github/workflows/build-docs.yaml
Lines changed: 10 additions & 2 deletions
diff --git a/Makefile b/Makefile
Lines changed: 4 additions & 0 deletions
diff --git a/autointent/_embedder.py b/autointent/_embedder.py
Lines changed: 2 additions & 1 deletion
diff --git a/autointent/configs/_optimization_cli.py b/autointent/configs/_optimization_cli.py
Lines changed: 1 addition & 3 deletions
diff --git a/autointent/context/_context.py b/autointent/context/_context.py
Lines changed: 16 additions & 32 deletions
diff --git a/autointent/context/_utils.py b/autointent/context/_utils.py
Lines changed: 10 additions & 12 deletions
diff --git a/autointent/context/data_handler/__init__.py b/autointent/context/data_handler/__init__.py
Lines changed: 4 additions & 4 deletions
@@ -4,6 +4,9 @@ on:
   push:
     branches:
     - dev
+  pull_request:
+    branches:
+      - dev
   workflow_dispatch:
 
 concurrency:
@@ -37,6 +40,10 @@ jobs:
         run: |
           poetry install --with docs
 
+      - name: Test documentation
+        run: |
+          make test-docs
+
       - name: build documentation
         run: |
           make docs
@@ -49,15 +56,16 @@ jobs:
           BRANCH_NAME=${BRANCH_NAME////_}
           echo BRANCH_NAME=${BRANCH_NAME} >> $GITHUB_ENV
 
-      - name: save artifact
+      - name: Upload artifact
         uses: actions/upload-artifact@v4
         with:
           name: ${{ format('github-pages-for-branch-{0}', env.BRANCH_NAME) }}
           path: docs/build/
           retention-days: 3
 
-      - name: deploy website
+      - name: Deploy to GitHub Pages
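         uses: JamesIves/github-pages-deploy-action@v4  # NOTE(review): exact pinned version was lost in extraction — confirm against the repository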
+        if: ${{ github.ref == 'refs/heads/dev' }}
         with:
           branch: gh-pages
           folder: docs/build/html/
 
@@ -28,6 +28,10 @@ sync:
 
 .PHONY: docs
 docs:
+	$(poetry) python -m sphinx build -b html docs/source docs/build/html
+
+.PHONY: test-docs
+test-docs: docs
 	$(poetry) python -m sphinx build -b doctest docs/source docs/build/html
 
 .PHONY: serve-docs
 
@@ -72,7 +72,8 @@ def delete(self) -> None:
         """Delete the embedding model and its associated directory."""
         self.clear_ram()
         shutil.rmtree(
-            self.dump_dir, ignore_errors=True
+            self.dump_dir,
+            ignore_errors=True,
         )  # TODO: `ignore_errors=True` is a workaround for PermissionError: [WinError 5] Access is denied
 
     def dump(self, path: Path) -> None:
 
@@ -123,8 +123,6 @@ class OptimizationConfig:
     """Configuration for the logging"""
     vector_index: VectorIndexConfig = field(default_factory=VectorIndexConfig)
     """Configuration for the vector index"""
-    augmentation: AugmentationConfig = field(default_factory=AugmentationConfig)
-    """Configuration for the augmentation"""
     embedder: EmbedderConfig = field(default_factory=EmbedderConfig)
     """Configuration for the embedder"""
 
@@ -133,7 +131,7 @@ class OptimizationConfig:
             "_self_",
             {"override hydra/job_logging": "autointent_standard_job_logger"},
             {"override hydra/help": "autointent_help"},
-        ]
+        ],
     )
 
 
 
@@ -9,15 +9,14 @@
 import yaml
 
 from autointent.configs import (
-    AugmentationConfig,
     DataConfig,
     EmbedderConfig,
     LoggingConfig,
     VectorIndexConfig,
 )
 
 from ._utils import NumpyEncoder, load_data
-from .data_handler import DataAugmenter, DataHandler, Dataset
+from .data_handler import DataHandler, Dataset
 from .optimization_info import OptimizationInfo
 from .vector_index_client import VectorIndex, VectorIndexClient
 
@@ -71,43 +70,29 @@ def configure_vector_index(self, config: VectorIndexConfig, embedder_config: Emb
             self.embedder_config.max_length,
         )
 
-    def configure_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None:
+    def configure_data(self, config: DataConfig) -> None:
         """
-        Configure data handling and augmentation.
+        Configure data handling.
 
         :param config: Configuration for the data handling process.
-        :param augmentation_config: Configuration for data augmentation. If None, no augmentation is applied.
-        """
-        if augmentation_config is not None:
-            self.augmentation_config = AugmentationConfig()
-            augmenter = DataAugmenter(
-                self.augmentation_config.multilabel_generation_config,
-                self.augmentation_config.regex_sampling,
-                self.seed,
-            )
-        else:
-            augmenter = None
-
+        """
         self.data_handler = DataHandler(
             dataset=load_data(config.train_path),
-            test_dataset=None if config.test_path is None else load_data(config.test_path),
             random_seed=self.seed,
             force_multilabel=config.force_multilabel,
-            augmenter=augmenter,
         )
 
-    def set_datasets(
-        self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False
-    ) -> None:
+    def set_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> None:
         """
-        Set the datasets for training and validation.
+        Set the dataset used for training, validation and testing.
 
-        :param train_data: Training dataset.
-        :param val_data: Validation dataset. If None, only training data is used.
+        :param dataset: Dataset.
         :param force_multilabel: Whether to force multilabel classification.
         """
         self.data_handler = DataHandler(
-            dataset=train_data, test_dataset=val_data, random_seed=self.seed, force_multilabel=force_multilabel
+            dataset=dataset,
+            force_multilabel=force_multilabel,
+            random_seed=self.seed,
         )
 
     def get_best_index(self) -> VectorIndex:
@@ -159,13 +144,12 @@ def dump(self) -> None:
         with logs_path.open("w") as file:
             json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder)
 
-        train_data, test_data = self.data_handler.dump()
-        train_path = logs_dir / "train_data.json"
-        test_path = logs_dir / "test_data.json"
-        with train_path.open("w") as file:
-            json.dump(train_data, file, indent=4, ensure_ascii=False)
-        with test_path.open("w") as file:
-            json.dump(test_data, file, indent=4, ensure_ascii=False)
+        # self._logger.info(make_report(optimization_results, nodes=nodes))
+
+        # dump the data handler's dataset to a single JSON file
+        dataset_path = logs_dir / "dataset.json"
+        with dataset_path.open("w") as file:
+            json.dump(self.data_handler.dump(), file, indent=4, ensure_ascii=False)
 
         self._logger.info("logs and other assets are saved to %s", logs_dir)
 
 
@@ -41,7 +41,7 @@ def default(self, obj: Any) -> str | int | float | list[Any] | Any:  # noqa: ANN
         return super().default(obj)
 
 
-def load_data(data_path: str | Path) -> Dataset:
+def load_data(filepath: str | Path) -> Dataset:
     """
     Load data from a specified path or use default sample data.
 
@@ -54,14 +54,12 @@ def load_data(data_path: str | Path) -> Dataset:
                       - "default-multilabel": Loads sample multilabel dataset.
     :return: A `Dataset` object containing the loaded data.
     """
-    if data_path == "default-multiclass":
-        with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file:
-            res = json.load(file)
-    elif data_path == "default-multilabel":
-        with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file:
-            res = json.load(file)
-    else:
-        with Path(data_path).open() as file:
-            res = json.load(file)
-
-    return Dataset.model_validate(res)
+    if filepath == "default-multiclass":
+        return Dataset.from_json(
+            ires.files("autointent.datafiles").joinpath("banking77.json"),  # type: ignore[arg-type]
+        )
+    if filepath == "default-multilabel":
+        return Dataset.from_json(
+            ires.files("autointent.datafiles").joinpath("dstc3-20shot.json"),  # type: ignore[arg-type]
+        )
+    return Dataset.from_json(filepath)
@@ -1,5 +1,5 @@
-from ._data_handler import DataAugmenter, DataHandler
-from ._schemas import Dataset
-from ._tags import Tag
+from ._data_handler import DataHandler
+from ._dataset import Dataset
+from ._schemas import Intent, Sample, Tag
 
-__all__ = ["DataAugmenter", "DataHandler", "Dataset", "Tag"]
+__all__ = ["DataHandler", "Dataset", "Intent", "Sample", "Tag"]