Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion autointent/generation/intents/prompt_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel, field_validator

from autointent.generation.utterances.prompts import PROMPT_DESCRIPTION
from autointent.generation.intents.prompts import PROMPT_DESCRIPTION


class PromptDescription(BaseModel):
Expand Down
32 changes: 23 additions & 9 deletions autointent/generation/utterances/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
from .basic import SynthesizerChatTemplate, UtteranceGenerator
from .evolution import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution, UtteranceEvolver
from .evolution import (
AbstractEvolution,
ConcreteEvolution,
EvolutionChatTemplate,
FormalEvolution,
FunnyEvolution,
GoofyEvolution,
InformalEvolution,
ReasoningEvolution,
UtteranceEvolver,
)
from .generator import Generator

__all__ = [
"AbstractEvolution",
"ConcreteEvolution",
"EvolutionChatTemplate",
"Generator",
"ReasoningEvolution",
"SynthesizerChatTemplate",
"UtteranceEvolver",
"UtteranceGenerator",
"AbstractEvolution",
"ConcreteEvolution",
"EvolutionChatTemplate",
"FormalEvolution",
"FunnyEvolution",
"Generator",
"GoofyEvolution",
"InformalEvolution",
"ReasoningEvolution",
"SynthesizerChatTemplate",
"UtteranceEvolver",
"UtteranceGenerator",
]
5 changes: 3 additions & 2 deletions autointent/generation/utterances/basic/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def main() -> None:
"""ClI endpoint."""
"""CLI endpoint."""
parser = ArgumentParser()
parser.add_argument(
"--input-path",
Expand Down Expand Up @@ -48,11 +48,12 @@ def main() -> None:
default=5,
help="Number of utterances to use as an example for augmentation",
)
parser.add_argument("--async-mode", action="store_true", help="Enable asynchronous generation")
args = parser.parse_args()

dataset = load_dataset(args.input_path)
template = SynthesizerChatTemplate(dataset, args.split, max_sample_utterances=args.n_sample_utterances)
generator = UtteranceGenerator(Generator(), template)
generator = UtteranceGenerator(Generator(), template, async_mode=args.async_mode)

n_before = len(dataset[args.split])
new_samples = generator.augment(dataset, split_name=args.split, n_generations=args.n_generations)
Expand Down
76 changes: 68 additions & 8 deletions autointent/generation/utterances/basic/utterance_generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Basic generation of new utterances from existing ones."""

import asyncio
from collections.abc import Callable

from datasets import Dataset as HFDataset
Expand All @@ -17,46 +18,105 @@ class UtteranceGenerator:
Basic generation of new utterances from existing ones.

This augmentation method simply prompts LLM to look at existing examples
and generate similar. Additionaly it can consider some aspects of style,
punctuation and length of the desired generations.
and generate similar. Additionally, it can consider some aspects of style,
punctuation, and length of the desired generations.
"""

def __init__(self, generator: Generator, prompt_maker: Callable[[Intent, int], list[Message]]) -> None:
def __init__(
self, generator: Generator, prompt_maker: Callable[[Intent, int], list[Message]], async_mode: bool = False
) -> None:
"""Initialize."""
self.generator = generator
self.prompt_maker = prompt_maker
self.async_mode = async_mode

def __call__(self, intent_data: Intent, n_generations: int) -> list[str]:
    """
    Generate new utterances for a single intent.

    :param intent_data: Intent passed to the prompt maker
    :param n_generations: Number of utterances requested from the prompt maker
    :return: Utterances extracted from the chat completion text
    """
    # Build the prompt, query the LLM synchronously, then parse the raw text.
    chat = self.prompt_maker(intent_data, n_generations)
    return _extract_utterances(self.generator.get_chat_completion(chat))

async def _call_async(self, intent_data: Intent, n_generations: int) -> list[str]:
    """
    Generate new utterances for a single intent without blocking the event loop.

    :param intent_data: Intent passed to the prompt maker
    :param n_generations: Number of utterances requested from the prompt maker
    :return: Utterances extracted from the chat completion text
    """
    # Same pipeline as ``__call__``, but the completion is awaited.
    chat = self.prompt_maker(intent_data, n_generations)
    completion = await self.generator.get_chat_completion_async(chat)
    return _extract_utterances(completion)

def augment(
self,
dataset: Dataset,
split_name: str = Split.TRAIN,
n_generations: int = 5,
update_split: bool = True,
batch_size: int = 4,
) -> list[Sample]:
"""
Augment some split of dataset.

TODO Note that for now it supports only single-label datasets.
:param dataset: Dataset object
:param split_name: Dataset split (default is TRAIN)
:param n_generations: Number of utterances to generate per intent
:param update_split: Whether to update the dataset split
:param batch_size: Batch size for async generation
:return: List of generated samples
"""
if self.async_mode:
return asyncio.get_event_loop().run_until_complete(
self._augment_async(dataset, split_name, n_generations, update_split, batch_size)
)

original_split = dataset[split_name]
new_samples = []
for intent in dataset.intents:
generated_utterances = self(
intent_data=intent,
n_generations=n_generations,
generated_utterances = self(intent_data=intent, n_generations=n_generations)
new_samples.extend(
[{Dataset.label_feature: intent.id, Dataset.utterance_feature: ut} for ut in generated_utterances]
)

if update_split:
generated_split = HFDataset.from_list(new_samples)
dataset[split_name] = concatenate_datasets([original_split, generated_split])

return [Sample(**sample) for sample in new_samples]

async def _augment_async(
    self,
    dataset: Dataset,
    split_name: str = Split.TRAIN,
    n_generations: int = 5,
    update_split: bool = True,
    batch_size: int = 4,
) -> list[Sample]:
    """
    Augment a dataset split asynchronously, processing intents in batches.

    Intents are taken ``batch_size`` at a time; requests inside one batch are
    awaited together, and batches are processed one after another.

    :param dataset: Dataset object
    :param split_name: Dataset split (default is TRAIN)
    :param n_generations: Number of utterances to generate per intent
    :param update_split: Whether to update the dataset split
    :param batch_size: Batch size for async generation
    :return: List of generated samples
    """
    split_before = dataset[split_name]

    # One list of utterances per intent, in the same order as ``dataset.intents``.
    utterances_per_intent = []
    for offset in range(0, len(dataset.intents), batch_size):
        chunk = dataset.intents[offset : offset + batch_size]
        utterances_per_intent += await asyncio.gather(
            *(self._call_async(intent_data=intent, n_generations=n_generations) for intent in chunk)
        )

    # ``asyncio.gather`` preserves input order, so results pair up with intents positionally.
    records = [
        {Dataset.label_feature: intent.id, Dataset.utterance_feature: text}
        for intent, utterances in zip(dataset.intents, utterances_per_intent)
        for text in utterances
    ]

    if update_split:
        dataset[split_name] = concatenate_datasets([split_before, HFDataset.from_list(records)])

    return [Sample(**record) for record in records]


Expand All @@ -68,4 +128,4 @@ def _extract_utterances(response_text: str) -> list[str]:
"""
raw_utterances = response_text.split("\n")
# remove enumeration
return [ut[ut.find(" ") + 1 :] for ut in raw_utterances]
return [ut[ut.find(" ") + 1 :] if " " in ut else ut for ut in raw_utterances]
23 changes: 21 additions & 2 deletions autointent/generation/utterances/evolution/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,23 @@
from .chat_templates import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution
from .chat_templates import (
AbstractEvolution,
ConcreteEvolution,
EvolutionChatTemplate,
FormalEvolution,
FunnyEvolution,
GoofyEvolution,
InformalEvolution,
ReasoningEvolution,
)
from .evolver import UtteranceEvolver

__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution", "UtteranceEvolver"]
__all__ = [
"AbstractEvolution",
"ConcreteEvolution",
"EvolutionChatTemplate",
"FormalEvolution",
"FunnyEvolution",
"GoofyEvolution",
"InformalEvolution",
"ReasoningEvolution",
"UtteranceEvolver",
]
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
from .abstract import AbstractEvolution
from .base import EvolutionChatTemplate
from .concrete import ConcreteEvolution
from .formal import FormalEvolution
from .funny import FunnyEvolution
from .goofy import GoofyEvolution
from .informal import InformalEvolution
from .reasoning import ReasoningEvolution

__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution"]
__all__ = [
"AbstractEvolution",
"ConcreteEvolution",
"EvolutionChatTemplate",
"FormalEvolution",
"FunnyEvolution",
"GoofyEvolution",
"InformalEvolution",
"ReasoningEvolution",
]
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ class AbstractEvolution(EvolutionChatTemplate):
),
Message(role=Role.ASSISTANT, content="Please, reserve a table for me."),
Message(
role=Role.ASSISTANT,
role=Role.USER,
content=(
"Intent name: requesting technical support"
"Intent name: requesting technical support\n"
"Utterance: My Lenovo laptop is constantly rebooting and overheating."
),
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ class ConcreteEvolution(EvolutionChatTemplate):
Message(role=Role.ASSISTANT, content="I want to reserve a table for 4 persons at 9 pm."),
Message(
role=Role.USER,
content=("Intent name: requesting technical support\n" "Utterance: I'm having trouble with my laptop."),
content=(
"Intent name: requesting technical support\n"
"Utterance: I'm having trouble with my laptop."
),
),
Message(role=Role.ASSISTANT, content="My laptop is constantly rebooting and overheating."),
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Chat template for formal tone augmentation."""

from typing import ClassVar

from autointent.generation.utterances.schemas import Message, Role
from autointent.schemas import Intent

from .base import EvolutionChatTemplate


class FormalEvolution(EvolutionChatTemplate):
    """
    Chat template for formal tone augmentation.

    Holds a fixed few-shot conversation (instructions plus two worked examples);
    calling the template appends the actual rewriting request to that history.
    """

    # Few-shot history. Every USER turn has the shape
    # "Intent name: <name>\nUtterance: <text>" and every ASSISTANT turn is the rewrite.
    _messages: ClassVar[list[Message]] = [
        Message(
            role=Role.USER,
            content=(
                "I want you to act as a rewriter. "
                "You will be provided with an utterance and the topic (name of intent class) of the utterance. "
                "You need to rewrite the utterance in a more formal tone using the following method:\n"
                "1. Rewrite the utterance in a more formal tone.\n"
                "2. Use polite and professional language while maintaining clarity.\n"
                "3. The rewritten utterance should be grammatically correct and complete.\n"
                "4. Keep the rewritten utterance within 15 words.\n\n"
                # The newline keeps the intent name and the utterance on separate lines,
                # matching the format of the other example and of the final request.
                "Intent name: Reserve Restaurant\n"
                "Utterance: I want to reserve a table for 4 persons at 9 pm."
            ),
        ),
        Message(role=Role.ASSISTANT, content="I would like to make a reservation for four guests at 9 pm."),
        Message(
            # The second example request is a user turn; the assistant only answers it.
            role=Role.USER,
            content=(
                "Intent name: requesting technical support\n"
                "Utterance: My Lenovo laptop is constantly rebooting and overheating."
            ),
        ),
        Message(
            role=Role.ASSISTANT,
            content="My Lenovo laptop frequently restarts and experiences overheating issues. Kindly assist.",
        ),
    ]

    def __call__(self, utterance: str, intent_data: Intent) -> list[Message]:
        """
        Generate chat for formal tone adaptation.

        :param utterance: Utterance to rewrite
        :param intent_data: Intent of the utterance; a missing name is rendered as an empty string
        :return: Few-shot history followed by the user request for this utterance
        """
        return [
            *self._messages,
            Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"),
        ]
46 changes: 46 additions & 0 deletions autointent/generation/utterances/evolution/chat_templates/funny.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Chat template for humorous tone augmentation."""

from typing import ClassVar

from autointent.generation.utterances.schemas import Message, Role
from autointent.schemas import Intent

from .base import EvolutionChatTemplate


class FunnyEvolution(EvolutionChatTemplate):
    """
    Chat template for humorous tone augmentation.

    Holds a fixed few-shot conversation (instructions plus two worked examples);
    calling the template appends the actual rewriting request to that history.
    """

    # Few-shot history. Every USER turn has the shape
    # "Intent name: <name>\nUtterance: <text>" and every ASSISTANT turn is the rewrite.
    _messages: ClassVar[list[Message]] = [
        Message(
            role=Role.USER,
            content=(
                "I want you to act as a rewriter. "
                "You will be provided with an utterance and the topic (name of intent class) of the utterance. "
                "You need to rewrite the utterance in a humorous way while maintaining its original meaning using "
                "the following method:\n"
                "1. Rewrite the utterance in a humorous way while maintaining its original meaning.\n"
                "2. Use wordplay, exaggeration, or lighthearted phrasing.\n"
                "3. The rewritten utterance should still be understandable and relevant.\n"
                "4. Keep it within 15 words.\n\n"
                # The newline keeps the intent name and the utterance on separate lines,
                # matching the format of the other example and of the final request.
                "Intent name: Reserve Restaurant\n"
                "Utterance: I want to reserve a table for 4 persons at 9 pm."
            ),
        ),
        Message(role=Role.ASSISTANT, content="Gotta feed my squad at 9 pm. Got a table for us?"),
        Message(
            role=Role.USER,
            content=(
                "Intent name: requesting technical support\n"
                "Utterance: My Lenovo laptop is constantly rebooting and overheating."
            ),
        ),
        Message(role=Role.ASSISTANT, content="My Lenovo thinks it's a phoenix—keeps dying and rising in flames."),
    ]

    def __call__(self, utterance: str, intent_data: Intent) -> list[Message]:
        """
        Generate chat for humorous tone adaptation.

        :param utterance: Utterance to rewrite
        :param intent_data: Intent of the utterance; a missing name is rendered as an empty string
        :return: Few-shot history followed by the user request for this utterance
        """
        return [
            *self._messages,
            Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"),
        ]
Loading