deeppavlov
diff --git a/‎autointent/_dataset/_dataset.py‎
Lines changed: 7 additions & 6 deletions b/‎autointent/_dataset/_dataset.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎autointent/generation/utterances/__init__.py‎
Lines changed: 14 additions & 0 deletions b/‎autointent/generation/utterances/__init__.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎autointent/generation/utterances/basic/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎autointent/generation/utterances/basic/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎autointent/generation/utterances/basic/chat_template.py‎
Lines changed: 134 additions & 0 deletions b/‎autointent/generation/utterances/basic/chat_template.py‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎autointent/generation/utterances/basic/chat_template.yaml‎
Lines changed: 0 additions & 119 deletions b/‎autointent/generation/utterances/basic/chat_template.yaml‎
Lines changed: 0 additions & 119 deletions
diff --git a/‎autointent/generation/utterances/basic/cli.py‎
Lines changed: 18 additions & 29 deletions b/‎autointent/generation/utterances/basic/cli.py‎
Lines changed: 18 additions & 29 deletions
diff --git a/‎autointent/generation/utterances/basic/extra_instructions.json‎
Lines changed: 0 additions & 14 deletions b/‎autointent/generation/utterances/basic/extra_instructions.json‎
Lines changed: 0 additions & 14 deletions
@@ -100,13 +100,14 @@ def from_hub(cls, repo_id: str) -> "Dataset":
         :param repo_id: ID of the Hugging Face repository.
         :return: Initialized Dataset object.
         """
-        splits, intents = load_dataset(repo_id), []
+        from ._reader import DictReader
+
+        splits = load_dataset(repo_id)
+        mapping = dict(**splits)
         if Split.INTENTS in get_dataset_config_names(repo_id):
-            intents = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
-        return cls(
-            splits.items(),
-            intents=[Intent.model_validate(intent) for intent in intents],
-        )
+            mapping["intents"] = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
+
+        return DictReader().read(mapping)
 
     def to_multilabel(self) -> "Dataset":
         """
 
@@ -0,0 +1,14 @@
+from .basic import SynthesizerChatTemplate, UtteranceGenerator
+from .evolution import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution, UtteranceEvolver
+from .generator import Generator
+
+__all__ = [
+    "AbstractEvolution",
+    "ConcreteEvolution",
+    "EvolutionChatTemplate",
+    "Generator",
+    "ReasoningEvolution",
+    "SynthesizerChatTemplate",
+    "UtteranceEvolver",
+    "UtteranceGenerator",
+]
@@ -0,0 +1,4 @@
+from .chat_template import SynthesizerChatTemplate
+from .utterance_generator import UtteranceGenerator
+
+__all__ = ["SynthesizerChatTemplate", "UtteranceGenerator"]
@@ -0,0 +1,134 @@
+"""Chat template for basic utterance synthesis from an intent name and example utterances."""
+
+import random
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import ClassVar
+
+from autointent import Dataset
+from autointent.generation.utterances.schemas import Message, Role
+from autointent.schemas import Intent
+
+
+class BaseSynthesizer(ABC):
+    """Abstract interface for callables that build a list of chat messages for an intent."""
+
+    @abstractmethod
+    def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]:
+        """Return the chat messages for ``intent_data``; ``n_examples`` is the number of examples to ask for."""
+
+
+class SynthesizerChatTemplate(BaseSynthesizer):
+    """Few-shot chat template that asks for additional example utterances of a given intent class."""
+
+    __messages: ClassVar[list[Message]] = [
+        Message(
+            role=Role.USER,
+            content=(
+                "You will be provided with a set of example utterances and the name "
+                "of the common topic (intent name) of these utterances. "
+                "Your task is to generate more examples that fit within the same intent name.\n\n"
+                "Note:\n"
+                "- You can generate similar utterances with only slot values changed\n"
+                "- You can generate completely different utterance from the same intent name\n"
+                "- Intent name can be missed, then you should infer from example utterances only\n"
+                "- Example utterances can be missed, then you should infer from intent name only\n"
+                "{extra_instructions}\n\n"
+                "Intent name: ordering_pizza\n\n"
+                "Example Utterances:\n"
+                "1. I want to order a large pepperoni pizza.\n"
+                "2. Can I get a medium cheese pizza with extra olives?\n"
+                "3. Please deliver a small veggie pizza to my address.\n\n"
+                "Please generate 3 more examples for the provided intent name."
+            ),
+        ),
+        Message(
+            role=Role.ASSISTANT,
+            content=(
+                "1. I'd like to order a large margherita pizza.\n"
+                "2. Can you deliver a medium Hawaiian pizza with extra pineapple?\n"
+                "3. Please send a small BBQ chicken pizza to my home."
+            ),
+        ),
+        Message(
+            role=Role.USER,
+            content=(
+                "Intent name: booking a hotel\n\n"
+                "Example Utterances:\n"
+                "1. I need to book a room for two nights in New York.\n\n"
+                "Please generate 2 more examples for the provided intent name."
+            ),
+        ),
+        Message(
+            role=Role.ASSISTANT,
+            content=(
+                "1. Can you reserve a deluxe room for my trip to Tokyo?\n"
+                "2. I need to book a hotel room with a mountain view in Denver."
+            ),
+        ),
+        Message(
+            role=Role.USER,
+            content=(
+                "Intent name:\n\n"
+                "Example Utterances:\n"
+                "1. What is the weather like today?\n\n"
+                "Please generate 2 more examples for the provided intent class."
+            ),
+        ),
+        Message(
+            role=Role.ASSISTANT,
+            content=("1. Can you tell me the forecast for tomorrow?\n" "2. Is it going to rain this weekend?"),
+        ),
+        Message(
+            role=Role.USER,
+            content=(
+                "Intent name: Scheduling a Meeting\n\n"
+                "Example Utterances:\n\n"
+                "Please generate 3 more examples for the provided intent class."
+            ),
+        ),
+        Message(
+            role=Role.ASSISTANT,
+            content=(
+                "1. I need to schedule a meeting for next Tuesday.\n"
+                "2. Can you set up a conference call for tomorrow afternoon?\n"
+                "3. Please arrange a meeting with the marketing team next week."
+            ),
+        ),
+    ]
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        split: str,
+        extra_instructions: str | None = None,
+        max_sample_utterances: int | None = None,
+    ) -> None:
+        """Initialize.
+
+        :param dataset: Dataset the example utterances are taken from.
+        :param split: Name of the dataset split the example utterances are taken from.
+        :param extra_instructions: Text substituted into the first prompt message; ``None`` means no extra text.
+        :param max_sample_utterances: Upper bound on example utterances per prompt; ``None`` keeps all of them.
+        """
+        self._messages = deepcopy(self.__messages)
+        head = self._messages[0]  # the only message containing the ``{extra_instructions}`` placeholder
+        head["content"] = head["content"].format(extra_instructions=extra_instructions or "")
+        self.dataset = dataset
+        self.split = split
+        self.max_sample_utterances = max_sample_utterances
+
+    def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]:
+        """Return the few-shot messages followed by a request for ``n_examples`` utterances of ``intent_data``."""
+        filtered_split = self.dataset[self.split].filter(lambda sample: sample[Dataset.label_feature] == intent_data.id)
+        utterances = filtered_split[Dataset.utterance_feature]
+        if self.max_sample_utterances is not None:
+            # Cap k at the population size: ``random.sample`` raises ValueError when k exceeds it,
+            # which would happen for any intent with fewer utterances than the requested maximum.
+            utterances = random.sample(utterances, k=min(self.max_sample_utterances, len(utterances)))
+        content = (
+            f"Intent name: {intent_data.name}\n\n"
+            f"Example Utterances:\n{utterances}\n\n"
+            f"Please generate {n_examples} more examples for the provided intent class.\n"
+        )
+        return [*self._messages, Message(role=Role.USER, content=content)]
@@ -1,11 +1,17 @@
 """CLI for basic utterance generator."""
 
+import logging
 from argparse import ArgumentParser
 
 from autointent import load_dataset
-from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator
+from autointent.generation.utterances.basic.utterance_generator import UtteranceGenerator
 from autointent.generation.utterances.generator import Generator
 
+from .chat_template import SynthesizerChatTemplate
+
+logging.basicConfig(level="INFO")
+logger = logging.getLogger(__name__)
+
 
 def main() -> None:
     """ClI endpoint."""
@@ -28,6 +34,7 @@ def main() -> None:
         default=None,
         help="Local path where to save result",
     )
+    parser.add_argument("--split", type=str, default="train")
     parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used")
     parser.add_argument(
         "--n-generations",
@@ -41,37 +48,19 @@ def main() -> None:
         default=5,
         help="Number of utterances to use as an example for augmentation",
     )
-    parser.add_argument(
-        "--custom-instruction",
-        type=str,
-        action="append",
-        help="Add extra instructions to default prompt."
-        "You can use this argument multiple times to add multiple instructions",
-    )
-    parser.add_argument(
-        "--length",
-        choices=LengthType.__args__,  # type: ignore[attr-defined]
-        default="none",
-        help="How to extend the prompt with length instruction",
-    )
-    parser.add_argument(
-        "--style",
-        choices=StyleType.__args__,  # type: ignore[attr-defined]
-        default="none",
-        help="How to extend the prompt with style instruction",
-    )
-    parser.add_argument(
-        "--same-punctuation",
-        action="store_true",
-        help="Whether to extend the prompt with punctuation instruction",
-    )
     args = parser.parse_args()
 
     dataset = load_dataset(args.input_path)
-    generator = UtteranceGenerator(
-        Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation
-    )
-    generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances)
+    template = SynthesizerChatTemplate(dataset, args.split, max_sample_utterances=args.n_sample_utterances)
+    generator = UtteranceGenerator(Generator(), template)
+
+    n_before = len(dataset[args.split])
+    new_samples = generator.augment(dataset, split_name=args.split, n_generations=args.n_generations)
+    n_after = len(dataset[args.split])
+
+    logger.info("# samples before %s", n_before)
+    logger.info("# samples generated %s", len(new_samples))
+    logger.info("# samples after %s", n_after)
 
     dataset.to_json(args.output_path)