deeppavlov
diff --git a/‎autointent/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎autointent/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎autointent/_dataset/_dataset.py‎
Lines changed: 4 additions & 1 deletion b/‎autointent/_dataset/_dataset.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎autointent/context/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎autointent/context/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎autointent/context/_context.py‎
Lines changed: 2 additions & 2 deletions b/‎autointent/context/_context.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎autointent/context/_utils.py‎
Lines changed: 7 additions & 7 deletions b/‎autointent/context/_utils.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎autointent/generation/intents/__init__.py‎ b/‎autointent/generation/intents/__init__.py‎
diff --git a/‎…/generation/utterances/basic/generate.py‎ ‎…ntent/generation/utterances/basic/cli.py‎autointent/generation/utterances/basic/generate.py renamed to autointent/generation/utterances/basic/cli.py
Lines changed: 32 additions & 28 deletions b/‎…/generation/utterances/basic/generate.py‎ ‎…ntent/generation/utterances/basic/cli.py‎autointent/generation/utterances/basic/generate.py renamed to autointent/generation/utterances/basic/cli.py
Lines changed: 32 additions & 28 deletions
diff --git a/‎autointent/generation/utterances/basic/utterance_generator.py‎
Lines changed: 76 additions & 44 deletions b/‎autointent/generation/utterances/basic/utterance_generator.py‎
Lines changed: 76 additions & 44 deletions
diff --git a/‎autointent/generation/utterances/evolution/chat_templates/base_instruction.txt‎
Lines changed: 1 addition & 1 deletion b/‎autointent/generation/utterances/evolution/chat_templates/base_instruction.txt‎
Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 from ._embedder import Embedder
 from ._dataset import Dataset
 from ._hash import Hasher
-from .context import Context
+from .context import Context, load_dataset
 from ._pipeline import Pipeline
 
-__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline"]
+__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "load_dataset"]
@@ -144,7 +144,10 @@ def to_json(self, filepath: str | Path) -> None:
 
         :param filepath: The path to the file where the JSON data will be saved.
         """
-        with Path(filepath).open("w") as file:
+        path = Path(filepath)
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+        with path.open("w") as file:
             json.dump(self.to_dict(), file, indent=4, ensure_ascii=False)
 
     def push_to_hub(self, repo_id: str, private: bool = False) -> None:
 
@@ -1,5 +1,6 @@
 """Core utilities for auto ML features."""
 
 from ._context import Context
+from ._utils import load_dataset
 
-__all__ = ["Context"]
+__all__ = ["Context", "load_dataset"]
@@ -16,7 +16,7 @@
     VectorIndexConfig,
 )
 
-from ._utils import NumpyEncoder, load_data
+from ._utils import NumpyEncoder, load_dataset
 from .data_handler import DataHandler
 from .optimization_info import OptimizationInfo
 from .vector_index_client import VectorIndexClient
@@ -81,7 +81,7 @@ def configure_data(self, config: DataConfig) -> None:
         :param config: Configuration for the data handling process.
         """
         self.data_handler = DataHandler(
-            dataset=load_data(config.train_path),
+            dataset=load_dataset(config.train_path),
             random_seed=self.seed,
             force_multilabel=config.force_multilabel,
         )
 
@@ -40,9 +40,9 @@ def default(self, obj: Any) -> str | int | float | list[Any] | Any:  # noqa: ANN
         return super().default(obj)
 
 
-def load_data(filepath: str | Path) -> Dataset:
+def load_dataset(path: str | Path) -> Dataset:
     """
-    Load data from a specified path or use default sample data.
+    Load data from a specified path, use default sample data, or load from the Hugging Face Hub.
 
     This function loads a dataset from a JSON file or retrieves sample data
     included with the `autointent` package for default multiclass or multilabel
@@ -53,10 +53,10 @@ def load_data(filepath: str | Path) -> Dataset:
                       - "default-multilabel": Loads sample multilabel dataset.
     :return: A `Dataset` object containing the loaded data.
     """
-    if filepath == "default-multiclass":
+    if path == "default-multiclass":
         return Dataset.from_hub("AutoIntent/clinc150_subset")
-    if filepath == "default-multilabel":
+    if path == "default-multilabel":
         return Dataset.from_hub("AutoIntent/clinc150_subset").to_multilabel()
-    if not Path(filepath).exists():
-        return Dataset.from_hub(str(filepath))
-    return Dataset.from_json(filepath)
+    if not Path(path).exists():
+        return Dataset.from_hub(str(path))
+    return Dataset.from_json(path)
@@ -1,45 +1,46 @@
-import json
-import os
+"""CLI for basic utterance generator."""
+
 from argparse import ArgumentParser
-from typing import Any
 
+from autointent import load_dataset
 from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator
 from autointent.generation.utterances.generator import Generator
 
 
-def read_json_dataset(file_path: os.PathLike):
-    with open(file_path) as file:
-        return json.load(file)
-
-
-def save_json_dataset(file_path: os.PathLike, intents: list[dict[str, Any]]):
-    dirname = os.path.dirname(file_path)
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-    with open(file_path, "w") as file:
-        json.dump(intents, file, indent=4, ensure_ascii=False)
-
-
-def main():
+def main() -> None:
+    """CLI endpoint."""
     parser = ArgumentParser()
     parser.add_argument(
         "--input-path",
         type=str,
         required=True,
-        help="Path to json with intent records",
+        help="Path to json or hugging face repo with dataset",
     )
     parser.add_argument(
         "--output-path",
         type=str,
         required=True,
-        help="Where to save result",
+        help="Local path where to save result",
     )
     parser.add_argument(
-        "--n-shots",
+        "--output-repo",
+        type=str,
+        default=None,
+        help="Hugging Face Hub repo id where to push result (optional)",
+    )
+    parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used")
+    parser.add_argument(
+        "--n-generations",
         type=int,
-        required=True,
+        default=5,
         help="Number of utterances to generate for each intent",
     )
+    parser.add_argument(
+        "--n-sample-utterances",
+        type=int,
+        default=5,
+        help="Number of utterances to use as an example for augmentation",
+    )
     parser.add_argument(
         "--custom-instruction",
         type=str,
@@ -49,13 +50,13 @@ def main():
     )
     parser.add_argument(
         "--length",
-        choices=LengthType.__args__,
+        choices=LengthType.__args__,  # type: ignore[attr-defined]
         default="none",
         help="How to extend the prompt with length instruction",
     )
     parser.add_argument(
         "--style",
-        choices=StyleType.__args__,
+        choices=StyleType.__args__,  # type: ignore[attr-defined]
         default="none",
         help="How to extend the prompt with style instruction",
     )
@@ -66,13 +67,16 @@ def main():
     )
     args = parser.parse_args()
 
-    intents = read_json_dataset(args.input_path)
+    dataset = load_dataset(args.input_path)
+    generator = UtteranceGenerator(
+        Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation
+    )
+    generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances)
 
-    generator = UtteranceGenerator(Generator(), args.custom_instruction, args.length, args.style, args.same_punctuation)
-    for intent_record in intents:
-        generator(intent_record, args.n_shots, inplace=True)
+    dataset.to_json(args.output_path)
 
-    save_json_dataset(args.output_path, intents)
+    if args.output_repo is not None:
+        dataset.push_to_hub(args.output_repo, private=args.private)
 
 
 if __name__ == "__main__":
 
@@ -1,73 +1,116 @@
+"""Basic generation of new utterances from existing ones."""
+
 import importlib.resources as ires
 import json
+import random
 from typing import Any, Literal
 
 import yaml
+from datasets import Dataset as HFDataset
+from datasets import concatenate_datasets
 
+from autointent import Dataset
+from autointent.custom_types import Split
 from autointent.generation.utterances.generator import Generator
-from autointent.generation.utterances.utils import safe_format
+from autointent.generation.utterances.utils import safe_format  # type: ignore[attr-defined]
+from autointent.schemas import Sample
 
 LengthType = Literal["none", "same", "longer", "shorter"]
 StyleType = Literal["none", "formal", "informal", "playful"]
 
 
 class UtteranceGenerator:
+    """
+    Basic generation of new utterances from existing ones.
+
+    This augmentation method simply prompts an LLM to look at existing examples
+    and generate similar ones. Additionally, it can consider some aspects of style,
+    punctuation and length of the desired generations.
+    """
+
     def __init__(
         self,
         generator: Generator,
         custom_instruction: list[str],
         length: LengthType,
         style: StyleType,
         same_punctuation: bool,
-    ):
+    ) -> None:
+        """Initialize."""
         self.generator = generator
-        prompt_template_yaml = load_prompt()
-        self.prompt_template_yaml = add_extra_instructions(
+        prompt_template_yaml = _load_prompt()
+        self.prompt_template_yaml = _add_extra_instructions(
             prompt_template_yaml,
             custom_instruction,
             length,
             style,
             same_punctuation,
         )
 
-    def _generate(self, intent_name: str, example_utterances: list[str], n_examples: int) -> list[str]:
+    def __call__(self, intent_name: str, example_utterances: list[str], n_generations: int) -> list[str]:
+        """Generate new utterances."""
         messages_yaml = safe_format(
             self.prompt_template_yaml,
             intent_name=intent_name,
-            example_utterances=format_utterances(example_utterances),
-            n_examples=n_examples,
+            example_utterances=_format_utterances(example_utterances),
+            n_examples=n_generations,
         )
         messages = yaml.safe_load(messages_yaml)
         response_text = self.generator.get_chat_completion(messages)
-        return extract_utterances(response_text)
+        return _extract_utterances(response_text)
 
-    def __call__(self, intent_record: dict[str, Any], n_examples: int, inplace: bool = True) -> list[str]:
-        intent_name = intent_record.get("intent_name", "")
-        example_utterances = intent_record.get("sample_utterances", [])
-        res_utterances = self._generate(intent_name, example_utterances, n_examples)
-        if inplace:
-            intent_record["sample_utterances"] = intent_record.get("sample_utterances", []) + res_utterances
-        return res_utterances
-
-
-def load_prompt():
-    with ires.files("autointent.generation.basic").joinpath("chat_template.yaml").open() as file:
+    def augment(
+        self,
+        dataset: Dataset,
+        split_name: str = Split.TRAIN,
+        n_generations: int = 5,
+        max_sample_utterances: int | None = 5,
+        update_split: bool = True,
+    ) -> list[Sample]:
+        """
+        Augment some split of dataset.
+
+        Note that for now it supports only single-label datasets.
+
+        :param dataset: Dataset whose split is used as the source of example utterances.
+        :param split_name: Name of the split to read examples from (and to extend, see ``update_split``).
+        :param n_generations: Number of utterances requested from the generator for each intent.
+        :param max_sample_utterances: Upper bound on the number of existing utterances shown
+            to the generator per intent; ``None`` passes all of them.
+        :param update_split: If True, append the generated samples to ``dataset[split_name]``.
+        :return: Generated samples for all intents.
+        """
+        original_split = dataset[split_name]
+        new_samples = []
+        for intent in dataset.intents:
+            # bind intent.id as a default argument so the lambda does not capture the loop variable late
+            filtered_split = original_split.filter(
+                lambda sample, intent_id=intent.id: sample[Dataset.label_feature] == intent_id
+            )
+            sample_utterances = filtered_split[Dataset.utterance_feature]
+            if max_sample_utterances is not None:
+                # random.sample raises ValueError when k exceeds the population size,
+                # so cap k for intents that have fewer utterances than requested
+                n_examples = min(max_sample_utterances, len(sample_utterances))
+                sample_utterances = random.sample(sample_utterances, k=n_examples)
+            generated_utterances = self(
+                intent_name=intent.name or "",
+                example_utterances=sample_utterances,
+                n_generations=n_generations,
+            )
+            new_samples.extend(
+                [{Dataset.label_feature: intent.id, Dataset.utterance_feature: ut} for ut in generated_utterances]
+            )
+        if update_split:
+            generated_split = HFDataset.from_list(new_samples)
+            dataset[split_name] = concatenate_datasets([original_split, generated_split])
+        return [Sample(**sample) for sample in new_samples]
+
+
+def _load_prompt() -> str:
+    with ires.files("autointent.generation.utterances.basic").joinpath("chat_template.yaml").open() as file:
         return file.read()
 
 
-def load_extra_instructions():
-    with ires.files("autointent.generation.basic").joinpath("extra_instructions.json").open() as file:
-        return json.load(file)
+def _load_extra_instructions() -> dict[str, Any]:
+    with ires.files("autointent.generation.utterances.basic").joinpath("extra_instructions.json").open() as file:
+        return json.load(file)  # type: ignore[no-any-return]
 
 
-def add_extra_instructions(
+def _add_extra_instructions(
     prompt_template_yaml: str,
     custom_instruction: list[str],
     length: LengthType,
     style: StyleType,
     same_punctuation: bool,
 ) -> str:
-    instructions = load_extra_instructions()
+    instructions = _load_extra_instructions()
 
     extra_instructions = []
     if length != "none":
@@ -80,40 +123,29 @@ def add_extra_instructions(
     extra_instructions.extend(custom_instruction)
 
     parsed_extra_instructions = "\n    ".join([f"- {s}" for s in extra_instructions])
-    return safe_format(prompt_template_yaml, extra_instructions=parsed_extra_instructions)
+    return safe_format(prompt_template_yaml, extra_instructions=parsed_extra_instructions)  # type: ignore[no-any-return]
 
 
-def format_utterances(utterances: list[str]) -> str:
+def _format_utterances(utterances: list[str]) -> str:
     """
-    Return
-    ---
-    str of the following format:
+    Convert given utterances into string that is ready to insert into prompt.
 
-    ```
+    Given list of utterances, the output string is returned in the following format:
+
+    .. code-block::
+
         1. I want to order a large pepperoni pizza.
         2. Can I get a medium cheese pizza with extra olives?
         3. Please deliver a small veggie pizza to my address.
-    ```
 
-    Note
-    ---
-    tab is inserted before each line because of how yaml processes multi-line fields
+    Note that tab is inserted before each line because of how yaml processes multi-line fields.
     """
     return "\n    ".join(f"{i}. {ut}" for i, ut in enumerate(utterances))
 
 
-def extract_utterances(response_text: str) -> list[str]:
+def _extract_utterances(response_text: str) -> list[str]:
     """
-    Input
-    ---
-    str of the following format:
-
-    ```
-    1. I want to order a large pepperoni pizza.
-    2. Can I get a medium cheese pizza with extra olives?
-    3. Please deliver a small veggie pizza to my address.
-    ```
+    Parse LLM output.
 
+    Inverse function to :py:func:`_format_utterances`.
     """
     raw_utterances = response_text.split("\n")
     # remove enumeration
 
@@ -1,3 +1,3 @@
 I want you to act as a rewriter.
     You will be provided with an utterance and the topic (name of intent class) of the utterance.
-    You MUST complicate the utterance using the following method:
+    You need to complicate the utterance using the following method: