
Commit 53806e6: Fix/documentation (#248)

* make preparations
* fix catboost
* fix doctests for bert-based methods
* minor fixes
* upd readme
* fix tutorials
* upd quickstart page
* run formatter
* fix typing

1 parent 8a4bc8e

File tree: 13 files changed (+44, -49 lines)


README.md

Lines changed: 4 additions & 0 deletions

@@ -34,3 +34,7 @@ pipeline = Pipeline.from_preset("light")
 pipeline.fit(dataset)
 pipeline.predict(["show me my latest transactions"])
 ```
+
+## Disclaimer
+
+This project is in a development phase. Bugs and breaking changes are expected. Contributions and feedback are welcome! See [CONTRIBUTING.md](./CONTRIBUTING.md).

autointent/modules/scoring/_bert.py

Lines changed: 9 additions & 2 deletions

@@ -2,7 +2,7 @@

 import tempfile
 from collections.abc import Callable
-from typing import Any
+from typing import Any, Literal

 import numpy as np
 import numpy.typing as npt
@@ -15,6 +15,8 @@
     DataCollatorWithPadding,
     EarlyStoppingCallback,
     EvalPrediction,
+    PrinterCallback,
+    ProgressCallback,
     Trainer,
     TrainingArguments,
 )
@@ -84,8 +86,9 @@ def __init__(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        report_to: REPORTERS_NAMES | None = None,  # type: ignore # noqa: PGH003
+        report_to: REPORTERS_NAMES | Literal["none"] = "none",  # type: ignore # noqa: PGH003
         early_stopping_config: EarlyStoppingConfig | dict[str, Any] | None = None,
+        print_progress: bool = False,
     ) -> None:
         self.classification_model_config = HFModelConfig.from_search_config(classification_model_config)
         self.num_train_epochs = num_train_epochs
@@ -94,6 +97,7 @@ def __init__(
         self.seed = seed
         self.report_to = report_to
         self.early_stopping_config = EarlyStoppingConfig.from_search_config(early_stopping_config)
+        self.print_progress = print_progress

     @classmethod
     def from_context(
@@ -187,6 +191,9 @@ def _train(self, tokenized_dataset: DatasetDict) -> None:
             compute_metrics=self._get_compute_metrics(),
             callbacks=self._get_trainer_callbacks(),
         )
+        if not self.print_progress:
+            trainer.remove_callback(PrinterCallback)  # type: ignore[attr-defined]
+            trainer.remove_callback(ProgressCallback)  # type: ignore[attr-defined]

         trainer.train()  # type: ignore[attr-defined]
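
For illustration, a minimal sketch of what the new print_progress flag does, assuming a Trainer already constructed as in _train (the helper name mute_trainer_output is hypothetical, not part of the module):

    from transformers import PrinterCallback, ProgressCallback, Trainer

    def mute_trainer_output(trainer: Trainer, print_progress: bool = False) -> Trainer:
        # Trainer.remove_callback accepts a callback class and detaches the
        # matching instance, so no log lines or progress bars are emitted.
        if not print_progress:
            trainer.remove_callback(PrinterCallback)
            trainer.remove_callback(ProgressCallback)
        return trainer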

autointent/modules/scoring/_catboost/catboost_scorer.py

Lines changed: 1 addition & 7 deletions

@@ -58,7 +58,7 @@ class CatBoostScorer(BaseScorer):
     `catboost's documentation <https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier>`_

     Example:
-    -------
+    --------

     .. testcode::

@@ -79,12 +79,6 @@ class CatBoostScorer(BaseScorer):
         scorer.fit(utterances, labels)
         test_utterances = ["hi", "bye"]
         probabilities = scorer.predict(test_utterances)
-        print(probabilities)
-
-    .. testoutput::
-
-        [[0.41493207 0.58506793]
-         [0.55036046 0.44963954]]

     """
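
The deleted ``.. testoutput::`` block pinned exact probabilities, which vary across platforms and catboost versions, so the doctest broke. A doctest-stable alternative (an illustrative sketch continuing the scorer from the example above, not code from this commit) asserts on properties that do not drift:

    import numpy as np

    probabilities = scorer.predict(["hi", "bye"])
    # Stable properties only: one row per utterance, each row summing to ~1.
    assert probabilities.shape == (2, 2)
    assert np.allclose(probabilities.sum(axis=1), 1.0)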

autointent/modules/scoring/_lora/lora.py

Lines changed: 4 additions & 7 deletions

@@ -1,7 +1,7 @@
 """BertScorer class for transformer-based classification with LoRA."""

 from pathlib import Path
-from typing import Any
+from typing import Any, Literal

 from peft import LoraConfig, get_peft_model

@@ -51,12 +51,7 @@ class BERTLoRAScorer(BertScorer):
         # Make predictions
         test_utterances = ["Good product", "Not worth it"]
         probabilities = scorer.predict(test_utterances)
-        print(probabilities)

-    .. testoutput::
-
-        [[0.89 0.11]
-         [0.23 0.77]]
     """

     name = "lora"
@@ -68,7 +63,8 @@ def __init__(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        report_to: REPORTERS_NAMES | None = None,  # type: ignore[valid-type]
+        report_to: REPORTERS_NAMES | Literal["none"] = "none",  # type: ignore # noqa: PGH003
+        print_progress: bool = False,
         **lora_kwargs: Any,  # noqa: ANN401
     ) -> None:
         # early stopping doesnt work with lora for now https://github.com/huggingface/transformers/issues/38130
@@ -82,6 +78,7 @@ def __init__(
             seed=seed,
             report_to=report_to,
             early_stopping_config=early_stopping_config,
+            print_progress=print_progress,
         )
         self._lora_config = LoraConfig(**lora_kwargs)
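
A hypothetical instantiation reflecting the updated signature; the import path and parameter values are assumptions for illustration, not taken from the commit:

    from autointent.modules.scoring import BERTLoRAScorer

    scorer = BERTLoRAScorer(
        classification_model_config="prajjwal1/bert-tiny",
        num_train_epochs=1,
        report_to="none",       # new default: no experiment trackers
        print_progress=False,   # new flag, forwarded to BertScorer
        r=8,                    # remaining kwargs go to peft.LoraConfig
        lora_alpha=16,
    )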

autointent/modules/scoring/_ptuning/ptuning.py

Lines changed: 3 additions & 8 deletions

@@ -34,7 +34,6 @@ class PTuningScorer(BertScorer):
             classification_model_config="prajjwal1/bert-tiny",
             num_train_epochs=3,
             batch_size=8,
-            task_type="SEQ_CLS",
             num_virtual_tokens=10,
             seed=42
         )
@@ -43,12 +42,6 @@ class PTuningScorer(BertScorer):
         scorer.fit(utterances, labels)
         test_utterances = ["hi", "bye"]
         probabilities = scorer.predict(test_utterances)
-        print(probabilities)
-
-    .. testoutput::
-
-        [[0.49925193 0.50074804]
-         [0.4944601 0.5055399 ]]

     """

@@ -61,13 +54,14 @@ def __init__(  # noqa: PLR0913
         batch_size: PositiveInt = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        report_to: REPORTERS_NAMES | None = None,  # type: ignore[valid-type]
+        report_to: REPORTERS_NAMES | Literal["none"] = "none",  # type: ignore # noqa: PGH003
         encoder_reparameterization_type: Literal["MLP", "LSTM"] = "LSTM",
         num_virtual_tokens: PositiveInt = 10,
         encoder_dropout: float = 0.1,
         encoder_hidden_size: PositiveInt = 128,
         encoder_num_layers: PositiveInt = 2,
         early_stopping_config: EarlyStoppingConfig | None = None,
+        print_progress: bool = False,
         **ptuning_kwargs: Any,  # noqa: ANN401
     ) -> None:
         super().__init__(
@@ -78,6 +72,7 @@ def __init__(  # noqa: PLR0913
             seed=seed,
             report_to=report_to,
             early_stopping_config=early_stopping_config,
+            print_progress=print_progress,
         )
         self._ptuning_config = PromptEncoderConfig(
             task_type=TaskType.SEQ_CLS,
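
The doctest drops ``task_type="SEQ_CLS"`` because, as the last hunk shows, the module now pins task_type=TaskType.SEQ_CLS itself when building PromptEncoderConfig, so passing it again via **ptuning_kwargs would collide. A call matching the corrected doctest (import path assumed for illustration):

    from autointent.modules.scoring import PTuningScorer

    scorer = PTuningScorer(
        classification_model_config="prajjwal1/bert-tiny",
        num_train_epochs=3,
        batch_size=8,
        num_virtual_tokens=10,  # task_type is now fixed internally
        seed=42,
    )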

docs/_static/versions.json

Lines changed: 7 additions & 3 deletions

@@ -1,10 +1,14 @@
 [
     {
-        "name": "v0.0.1 (stable)",
-        "version": "v0.0.1",
-        "url": "https://deeppavlov.github.io/AutoIntent/versions/v0.0.1/",
+        "name": "v0.1.0 (stable)",
+        "version": "v0.1.0",
+        "url": "https://deeppavlov.github.io/AutoIntent/versions/v0.1.0/",
         "preferred": true
     },
+    {
+        "version": "v0.0.1",
+        "url": "https://deeppavlov.github.io/AutoIntent/versions/v0.0.1/"
+    },
     {
         "version": "dev (dev)",
         "url": "https://deeppavlov.github.io/AutoIntent/versions/dev/"

docs/source/conf.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@
 project = "AutoIntent"
 copyright = "2025, DeepPavlov"
 author = "DeepPavlov"
-release = "0.1.0"
+release = "0.2.0"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

docs/source/index.rst

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ Example of building an intent classifier in a couple of lines of code:
     from autointent import Pipeline, Dataset

     dataset = Dataset.from_json(path_to_json)
-    pipeline = Pipeline.from_preset("light_extra")
+    pipeline = Pipeline.from_preset("classic-light")
     pipeline.fit(dataset)
     pipeline.predict(["show me my latest recent transactions"])

docs/source/quickstart.rst

Lines changed: 7 additions & 12 deletions

@@ -49,25 +49,20 @@ To load a dataset from the file system into Python, the :meth:`autointent.Datase
 AutoML goes brrr...
 -------------------

-Once the data is ready, you can start building the optimal classifier from the command line:
-
-.. code-block:: bash
-
-    autointent data.train_path="path/to/your/data.json"
-
-This command will start the hyperparameter search in the default :ref:`search space <key-search-space>`.
-
-As a result, a ``runs`` folder will be created in the current working directory, which will save the selected classifier ready for inference.
-
-Similar actions but in a limited mode can be started using the Python API:
+Once the data is ready, you can start building the optimal classifier:

 .. code-block:: python

     from autointent import PipelineOptimizer

-    pipeline_optimizer = PipelineOptimizer.default(multilabel=False)
+    pipeline_optimizer = PipelineOptimizer.from_preset("classic-light")
     pipeline_optimizer.fit(dataset)

+This code starts the hyperparameter search with the preset :ref:`search space <key-search-space>`.
+
+As a result, a ``runs`` folder will be created in the current working directory, which will save the selected classifier ready for inference.
+
+
 Inference
 ---------
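
Putting the rewritten quickstart together, a sketch of the full flow under the new API (the JSON path is a placeholder):

    from autointent import Dataset, PipelineOptimizer

    dataset = Dataset.from_json("path/to/your/data.json")

    # from_preset replaces the old PipelineOptimizer.default(multilabel=False)
    pipeline_optimizer = PipelineOptimizer.from_preset("classic-light")
    pipeline_optimizer.fit(dataset)  # writes the selected classifier under ./runs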

docs/source/user_guides.rst

Lines changed: 0 additions & 1 deletion

@@ -9,5 +9,4 @@ User Guides

    user_guides/index_basic_usage
    user_guides/index_advanced_usage
-   user_guides/index_cli_usage
    augmentation_tutorials/index
