аугментация с целью балансировки датасета (#148)

SeBorgey · web-flow · commit a5b777e07068 · 2025-02-27T19:06:42.000+03:00
* skeleton of code balancer

* first working tested balancer

* review changes

* fix autocheck

* fixcheck 2: return of the fixcheck

* fix after second review

* fix after pull dev

* check autofix: episode 3
diff --git a/autointent/generation/utterances/__init__.py b/autointent/generation/utterances/__init__.py
@@ -1,3 +1,4 @@
+from .balancer import DatasetBalancer
 from .basic import EnglishSynthesizerTemplate, RussianSynthesizerTemplate, UtteranceGenerator
 from .evolution import (
     AbstractEvolution,
@@ -16,6 +17,7 @@
 __all__ = [
     "AbstractEvolution",
     "ConcreteEvolution",
+    "DatasetBalancer",
     "EvolutionChatTemplate",
     "FormalEvolution",
     "FunnyEvolution",
diff --git a/autointent/generation/utterances/balancer.py b/autointent/generation/utterances/balancer.py
@@ -0,0 +1,154 @@
+"""Module for balancing datasets through augmentation of underrepresented classes."""
+
+import logging
+from collections import defaultdict
+
+from datasets import Dataset as HFDataset
+
+from autointent import Dataset
+from autointent.custom_types import Split
+from autointent.generation.utterances.basic.chat_templates._base import BaseSynthesizerTemplate
+from autointent.generation.utterances.basic.utterance_generator import UtteranceGenerator
+from autointent.generation.utterances.generator import Generator
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetBalancer:
+    """Class for balancing dataset through example augmentation."""
+
+    def __init__(
+        self,
+        generator: Generator,
+        prompt_maker: BaseSynthesizerTemplate,
+        async_mode: bool = False,
+        max_samples_per_class: int | None = None,
+    ) -> None:
+        """
+        Initialize the UtteranceBalancer.
+
+        Args:
+            generator (Generator): The generator object used to create utterances.
+            prompt_maker (Callable[[Intent, int], list[Message]]): A callable that creates prompts for the generator.
+            seed (int, optional): The seed for random number generation. Defaults to 42.
+            async_mode (bool, optional): Whether to run the generator in asynchronous mode. Defaults to False.
+            max_samples_per_class (int | None, optional): The maximum number of samples per class.
+                Must be a positive integer or None. Defaults to None.
+        Raises:
+            ValueError: If max_samples_per_class is not None and is less than or equal to 0.
+        """
+        if max_samples_per_class is not None and max_samples_per_class <= 0:
+            msg = "max_samples_per_class must be a positive integer or None"
+            raise ValueError(msg)
+
+        self.utterance_generator = UtteranceGenerator(
+            generator=generator, prompt_maker=prompt_maker, async_mode=async_mode
+        )
+        self.max_samples = max_samples_per_class
+
+    def balance(self, dataset: Dataset, split: str = Split.TRAIN, batch_size: int = 4) -> Dataset:
+        """
+        Balances the specified dataset split.
+
+        :param dataset: Source dataset
+        :param split: Target split for balancing
+        :param n_evolutions: Number of augmentations per example
+        :param batch_size: Batch size for asynchronous processing
+        :return: Balanced dataset
+        """
+        if dataset.multilabel:
+            msg = "Method supports only single-label datasets"
+            raise ValueError(msg)
+
+        class_counts = self._count_class_examples(dataset, split)
+        max_count = max(class_counts.values())
+        target_count = self.max_samples if self.max_samples is not None else max_count
+        logger.debug("Target count per class: %s", target_count)
+        for class_id, current_count in class_counts.items():
+            if current_count < target_count:
+                needed = target_count - current_count
+                self._augment_class(dataset, split, class_id, needed, batch_size)
+
+        return dataset
+
+    def _count_class_examples(self, dataset: Dataset, split: str) -> dict[int, int]:
+        """Count the number of examples for each class."""
+        counts: dict[int, int] = defaultdict(int)
+        for sample in dataset[split]:
+            counts[sample[Dataset.label_feature]] += 1
+        return counts
+
+    def _augment_class(self, dataset: Dataset, split: str, class_id: int, needed: int, batch_size: int) -> None:
+        """Generate additional examples for the class."""
+        intent = next(i for i in dataset.intents if i.id == class_id)
+        class_name = getattr(intent, "name", f"class_{class_id}")
+        logger.debug("Starting augmentation for class %s (%s)", class_id, class_name)
+        logger.debug("Initial samples: %s", len([s for s in dataset[split] if s[Dataset.label_feature] == class_id]))
+        logger.debug("Target needed: %s samples", needed)
+
+        class_samples = [s for s in dataset[split] if s[Dataset.label_feature] == class_id]
+        if not class_samples:
+            msg = f"No samples for class {class_id}"
+            raise ValueError(msg)
+
+        generated_utterances: list[str] = []
+        max_attempts = 5
+        attempts = 0
+
+        while len(generated_utterances) < needed and attempts < max_attempts:
+            current_needed = needed - len(generated_utterances)
+            current_batch = min(batch_size, current_needed)
+            logger.debug("Attempt %s: Generating %s utterances for class %s", attempts + 1, current_batch, class_id)
+
+            new_utterances = self.utterance_generator(intent_data=intent, n_generations=current_batch)
+
+            valid_utterances = self._process_utterances(new_utterances)
+            for ut in valid_utterances:
+                if ut and isinstance(ut, str):
+                    generated_utterances.append(ut)
+                    if len(generated_utterances) >= needed:
+                        break
+
+            logger.debug("Generated %s valid utterances in this attempt", len(valid_utterances))
+            logger.debug(
+                "Progress: %s/%s (%s%%)",
+                len(generated_utterances),
+                needed,
+                min(100, int(len(generated_utterances) / needed * 100)),
+            )
+
+            attempts += 1
+
+        if len(generated_utterances) < needed:
+            logger.debug(
+                "Warning: Could only generate %s/%s utterances after %s attempts",
+                len(generated_utterances),
+                needed,
+                max_attempts,
+            )
+
+        generated_utterances = generated_utterances[:needed]
+
+        new_samples = []
+        for utterance in generated_utterances:
+            new_sample = {Dataset.utterance_feature: utterance, Dataset.label_feature: class_id}
+            new_samples.append(new_sample)
+
+        updated_data = list(dataset[split]) + new_samples
+        dataset[split] = HFDataset.from_list(updated_data)
+
+        final_count = len([s for s in dataset[split] if s[Dataset.label_feature] == class_id])
+        logger.debug("Completed augmentation for class %s (%s)", class_id, class_name)
+        logger.debug("Total samples after augmentation: %s", final_count)
+
+    def _process_utterances(self, generated: list[str]) -> list[str]:
+        """Process and clean generated utterances."""
+        processed = []
+        for ut in generated:
+            if "', '" in ut or "',\n" in ut:
+                clean_ut = ut.replace("[", "").replace("]", "").replace("'", "")
+                split_ut = [u.strip() for u in clean_ut.split(", ") if u.strip()]
+                processed.extend(split_ut)
+            else:
+                processed.append(ut.strip())
+        return processed
diff --git a/tests/generation/utterances/test_balancer.py b/tests/generation/utterances/test_balancer.py
@@ -0,0 +1,101 @@
+import logging
+import os
+from collections import defaultdict
+from unittest.mock import AsyncMock, Mock, patch
+
+import pytest
+
+from autointent import Dataset
+from autointent.custom_types import Split
+from autointent.generation.utterances import DatasetBalancer, Generator
+from autointent.generation.utterances.basic.chat_templates._synthesizer_en import EnglishSynthesizerTemplate
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+def mock_generator():
+    generator = Mock(spec=Generator)
+    generator.get_chat_completion.return_value = "test_utterance"
+    generator.get_chat_completion_async = AsyncMock(return_value="test_utterance")
+    return generator
+
+
+@pytest.fixture
+def mock_prompt_maker():  # stand-in prompt maker: every call returns a one-element list holding a Mock
+    return Mock(return_value=[Mock()])
+
+
+@pytest.fixture
+def unbalanced_dataset():
+    return Dataset.from_dict(
+        {
+            "intents": [{"id": 0, "name": "A"}, {"id": 1, "name": "B"}],
+            "train": [
+                {"utterance": "test a1", "label": 0},
+                {"utterance": "test a2", "label": 0},
+                {"utterance": "test b1", "label": 1},
+            ],
+        }
+    )
+
+
+def test_balancer(unbalanced_dataset, mock_generator, mock_prompt_maker):
+    balancer = DatasetBalancer(generator=mock_generator, prompt_maker=mock_prompt_maker)
+    logger.info("Before balancing:")
+    for sample in unbalanced_dataset[Split.TRAIN]:
+        logger.info("Utterance: %s, Label: %s", sample["utterance"], sample["label"])
+
+    with patch.object(balancer.utterance_generator, "__call__") as mock_call:
+        mock_call.return_value = ["generated_utterance"]
+
+        balanced = balancer.balance(unbalanced_dataset)
+
+    logger.info("After balancing:")
+    for sample in balanced[Split.TRAIN]:
+        logger.info("Utterance: %s, Label: %s", sample["utterance"], sample["label"])
+
+    labels = [s["label"] for s in balanced[Split.TRAIN]]
+    assert labels.count(0) == 2, "Class 0 should not change"
+    assert labels.count(1) == 2, "Class 1 should increase to 2"
+    assert len(labels) == 4, "The total number of examples should be 4"
+
+    original_utterances = {s["utterance"] for s in unbalanced_dataset[Split.TRAIN]}
+    balanced_utterances = {s["utterance"] for s in balanced[Split.TRAIN]}
+    assert original_utterances.issubset(balanced_utterances)
+
+
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Requires OpenAI API key in environment")
+def test_real_balancer():
+    test_data = {
+        "intents": [{"id": 0, "name": "Book restaurant"}, {"id": 1, "name": "Check weather"}],
+        "train": [
+            {"utterance": "Book a table for two", "label": 0},
+            {"utterance": "Reserve a table", "label": 0},
+            {"utterance": "What's the weather in Moscow?", "label": 1},
+        ],
+    }
+    dataset = Dataset.from_dict(test_data)
+    template = EnglishSynthesizerTemplate(dataset, split="train")
+    generator = Generator()
+    evolutions = template
+    balancer = DatasetBalancer(generator=generator, prompt_maker=evolutions, max_samples_per_class=3, async_mode=False)
+
+    logger.info("Starting balance process...")
+    balanced = balancer.balance(dataset)
+
+    class_counts = defaultdict(int)
+    for sample in balanced[Split.TRAIN]:
+        class_counts[sample["label"]] += 1
+
+    logger.info("Balancing results:")
+    logger.info("Class 0 count: %s", class_counts[0])
+    logger.info("Class 1 count: %s", class_counts[1])
+    logger.info("Generated examples:")
+    for sample in balanced[Split.TRAIN]:
+        if sample["utterance"] not in {s["utterance"] for s in test_data["train"]}:
+            logger.info("[Class %s]: %s", sample["label"], sample["utterance"])
+
+    assert class_counts[0] == 3, "Class 0 should have 3 examples"
+    assert class_counts[1] == 3, "Class 1 should have 3 examples"
+    assert len(balanced[Split.TRAIN]) == 6, "Total examples should be 6"