
Commit 3c5ed18

adversarial augmentation
1 parent c6681e7 commit 3c5ed18

File tree

3 files changed: +190 −0 lines changed

Lines changed: 4 additions & 0 deletions (package __init__.py)
@@ -0,0 +1,4 @@
from .critic_human_like import CriticHumanLike
from .human_utterance_generator import HumanUtteranceGenerator

__all__ = ["CriticHumanLike", "HumanUtteranceGenerator"]
Lines changed: 74 additions & 0 deletions (critic_human_like.py)
@@ -0,0 +1,74 @@
"""CriticHumanLike class for distinguishing human vs generated utterances."""

from typing import Literal

from pydantic import BaseModel

from autointent.generation import Generator
from autointent.generation.chat_templates import Message, Role


class CriticResponse(BaseModel):
    """Structured answer."""

    reasoning: str
    label: Literal["human", "generated"]
    explanation: str


class CriticHumanLike:
    """A simple critic class that classifies user utterances as either 'human' or 'generated',
    using an LLM-based binary classifier prompt.
    """

    def __init__(self, generator: Generator) -> None:
        """Initialize the CriticHumanLike.

        Args:
            generator: Wrapper for the LLM API to generate classification responses.
        """
        self.generator = generator

    def build_classification_prompt(self, example: str, intent_name: str) -> Message:
        """Build the classification prompt for a single utterance.

        Args:
            example: The user utterance to classify.
            intent_name: The name of the intent associated with the utterance.

        Returns:
            Message: A formatted message prompt for classification.
        """
        content = (
            "You are a critic that determines whether a user utterance was written by a human or "
            "generated by a language model.\n\n"
            f"Intent: {intent_name}\n"
            f'Utterance: "{example}"\n\n'
            "Respond in **JSON format** with three keys:\n"
            "- `reasoning`: a short chain-of-thought where you explain your logic\n"
            "- `label`: must be either `human` or `generated`\n"
            "- `explanation`: a concise summary of your decision\n\n"
            "Example:\n"
            "{\n"
            '    "reasoning": "The phrasing includes casual contractions and natural hesitation. The utterance '
            'flows similarly to how a human would speak spontaneously.",\n'
            '    "label": "human",\n'
            '    "explanation": "The utterance includes natural hesitation and informal phrasing '
            'typical of human speech."\n'
            "}"
        )
        return Message(role=Role.USER, content=content)

    def is_human(self, utterance: str, intent_name: str) -> bool:
        """Classify an utterance as human-written or machine-generated.

        Args:
            utterance: The utterance to evaluate.
            intent_name: The associated intent.

        Returns:
            bool: True if classified as human, False otherwise.
        """
        messages = self.build_classification_prompt(utterance, intent_name)
        response: CriticResponse = self.generator.get_structured_output_sync(
            messages=messages,
            output_model=CriticResponse,
            max_retries=3,
        )
        return response.label == "human"
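
As a quick sanity check, the critic can be exercised on its own. A minimal sketch follows; the import prefix, the `Generator()` construction, and the sample utterance/intent are assumptions not shown in this commit.

# Minimal sketch (assumptions: the import prefix and the Generator construction
# are placeholders; adapt them to your autointent setup).
from autointent.generation import Generator

from critic_human_like import CriticHumanLike  # adjust the package prefix to where this module lives

generator = Generator()  # assumption: construction depends on your LLM backend configuration
critic = CriticHumanLike(generator)

verdict = critic.is_human("uhh can you check if my card got blocked?", intent_name="card_blocked")
print("human" if verdict else "generated")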
Lines changed: 112 additions & 0 deletions (human_utterance_generator.py)
@@ -0,0 +1,112 @@
from collections import defaultdict

from datasets import Dataset as HFDataset
from datasets import concatenate_datasets

from autointent import Dataset
from autointent.custom_types import Split
from autointent.generation import Generator
from autointent.generation.chat_templates._evolution_templates_schemas import Message, Role
from autointent.schemas import Sample

from .critic_human_like import CriticHumanLike


class HumanUtteranceGenerator:
    """Generator of human-like utterances.

    This class rewrites given user utterances to make them sound more natural and human-like,
    while preserving their original intent. The generation process is iterative and attempts
    to bypass a critic that identifies machine-generated text.
    """

    def __init__(self, generator: Generator, critic: CriticHumanLike) -> None:
        """Initialize the HumanUtteranceGenerator.

        Args:
            generator: Wrapper for the LLM API used to generate utterances.
            critic: Critic to determine whether the generated utterance sounds human-like.
        """
        self.generator = generator
        self.critic = critic

    def augment(
        self,
        dataset: Dataset,
        split_name: str = Split.TRAIN,
        update_split: bool = True,
        n_final_per_class: int = 5,
    ) -> list[Sample]:
        """Generate human-like utterances for each intent by iteratively refining machine-generated candidates.

        Args:
            dataset: The dataset to augment.
            split_name: The name of the split to augment (e.g., 'train').
            update_split: Whether to update the dataset split with the new utterances.
            n_final_per_class: Number of successful utterances to generate per intent.

        Returns:
            list[Sample]: List of newly generated samples.
        """
        original_split = dataset[split_name]
        id_to_name = {intent.id: intent.name for intent in dataset.intents}
        new_samples = []

        class_to_samples = defaultdict(list)
        for sample in original_split:
            class_to_samples[sample["label"]].append(sample["utterance"])

        for intent_id, intent_name in id_to_name.items():
            generated_count = 0
            attempt = 0

            seed_utterances = class_to_samples.get(intent_id, [])
            if not seed_utterances:
                continue

            while generated_count < n_final_per_class and attempt < n_final_per_class * 3:
                attempt += 1
                seed = seed_utterances[attempt % len(seed_utterances)]
                rejected = []

                for _ in range(3):
                    prompt = self._build_adversarial_prompt(seed, intent_name, rejected)
                    generated = self.generator.get_chat_completion([prompt]).strip()

                    if self.critic.is_human(generated, intent_name):
                        new_samples.append({
                            Dataset.label_feature: intent_id,
                            Dataset.utterance_feature: generated,
                        })
                        generated_count += 1
                        break
                    rejected.append(generated)

        if update_split:
            generated_split = HFDataset.from_list(new_samples)
            dataset[split_name] = concatenate_datasets([original_split, generated_split])

        return [Sample(**sample) for sample in new_samples]

    def _build_adversarial_prompt(self, example: str, intent_name: str, rejected: list[str]) -> Message:
        """Build an adversarial prompt to guide the model in generating more human-like utterances.

        Args:
            example: The original utterance to be modified.
            intent_name: The intent of the utterance.
            rejected: List of previously rejected generations.

        Returns:
            Message: A formatted prompt guiding the generator to improve naturalness.
        """
        rejected_block = "\n".join(f"- {r}" for r in rejected) if rejected else "None"
        content = (
            "Your task is to rewrite the following user utterance so that it sounds as natural "
            "and human-like as possible, while preserving its original intent: "
            f"'{intent_name}'.\n\n"
            f'Original utterance: "{example}"\n\n'
            f"The following previous attempts were classified as machine-generated and rejected:\n{rejected_block}\n\n"
            "Try to write something that would pass as written by a real human. Output a single version only.\n"
            "IMPORTANT: You must modify the original utterance."
        )
        return Message(role=Role.USER, content=content)
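
For completeness, a minimal end-to-end sketch of how the two new classes are meant to be wired together. The import prefixes, the `Generator()` construction, and the dataset loading are assumptions; only the class APIs themselves come from this commit.

# End-to-end sketch of the adversarial augmentation loop (assumptions: import
# prefixes, Generator construction, and dataset loading are placeholders).
from autointent import Dataset
from autointent.generation import Generator

from critic_human_like import CriticHumanLike                   # adjust prefixes to the
from human_utterance_generator import HumanUtteranceGenerator   # package location

dataset: Dataset = ...  # assumption: an autointent Dataset with a train split, loaded elsewhere
generator = Generator()  # assumption: backend-specific construction
critic = CriticHumanLike(generator)
augmenter = HumanUtteranceGenerator(generator, critic)

# Up to n_final_per_class critic-approved, human-sounding utterances are generated per
# intent and, with update_split=True (the default), appended to the train split in place.
new_samples = augmenter.augment(dataset, n_final_per_class=5)
print(f"added {len(new_samples)} augmented utterances")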
