Add few shot (#187)

Samoed · github-actions[bot] · web-flow · commit 791669eb3254 · 2025-04-21T14:33:23.000+03:00
* init few shot

* Update optimizer_config.schema.json

* apply few shot to all

* Update optimizer_config.schema.json

* fix test

* lint

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py
@@ -21,14 +21,22 @@ class DataConfig(BaseModel):
     validation_size: FloatFromZeroToOne = Field(
         0.2,
         description=(
-            "Fraction of train samples to allocate for validation (if input dataset doesn't contain validation split)."
+            "Fraction of train samples to allocate for validation (if input dataset doesn't contain validation split). "
+            "If `is_few_shot_train` is True, this value will be ignored."
         ),
     )
     """Fraction of train samples to allocate for validation (if input dataset doesn't contain validation split)."""
     separation_ratio: FloatFromZeroToOne | None = Field(
         0.5, description="Set to float to prevent data leak between scoring and decision nodes."
     )
     """Set to float to prevent data leak between scoring and decision nodes."""
+    is_few_shot_train: bool = Field(False, description="Whether to use few-shot training.")
+    """Whether to use few-shot training."""
+    examples_per_intent: PositiveInt = Field(
+        8,
+        description="Number of examples per intent for few-shot validation. If None, all examples will be used.",
+    )
+    """Number of examples per intent kept in the train split when `is_few_shot_train` is True."""
 
 
 class LoggingConfig(BaseModel):
diff --git a/autointent/context/data_handler/_data_handler.py b/autointent/context/data_handler/_data_handler.py
@@ -11,7 +11,7 @@
 from autointent.custom_types import FloatFromZeroToOne, ListOfGenericLabels, ListOfLabels, Split
 from autointent.schemas import Tag
 
-from ._stratification import split_dataset
+from ._stratification import create_few_shot_split, split_dataset
 
 logger = logging.getLogger(__name__)
 
@@ -48,9 +48,14 @@ def __init__(
         self._n_classes = self.dataset.n_classes
 
         if self.config.scheme == "ho":
-            self._split_ho(self.config.separation_ratio, self.config.validation_size)
+            self._split_ho(
+                self.config.separation_ratio,
+                self.config.validation_size,
+                self.config.is_few_shot_train,
+                self.config.examples_per_intent,
+            )
         elif self.config.scheme == "cv":
-            self._split_cv()
+            self._split_cv(self.config.is_few_shot_train, self.config.examples_per_intent)
 
         self._logger = logger
 
@@ -149,8 +154,8 @@ def test_labels(self) -> ListOfGenericLabels:
 
     def validation_iterator(self) -> Generator[tuple[list[str], ListOfLabels, list[str], ListOfLabels]]:
         """Yield folds for cross-validation."""
-        if self.config.scheme == "ho":
-            msg = "Cannot call cross-validation on hold-out DataHandler"
+        if self.config.scheme != "cv":
+            msg = f"Cannot call cross-validation on {self.config.scheme} DataHandler"
             raise RuntimeError(msg)
 
         for j in range(self.config.n_folds):
@@ -165,14 +170,22 @@ def validation_iterator(self) -> Generator[tuple[list[str], ListOfLabels, list[s
             train_labels = [lab for lab in train_labels if lab is not None]
             yield train_utterances, train_labels, val_utterances, val_labels  # type: ignore[misc]
 
-    def _split_ho(self, separation_ratio: FloatFromZeroToOne | None, validation_size: FloatFromZeroToOne) -> None:
+    def _split_ho(
+        self,
+        separation_ratio: FloatFromZeroToOne | None,
+        validation_size: FloatFromZeroToOne,
+        is_few_shot: bool,
+        examples_per_intent: int,
+    ) -> None:
         has_validation_split = any(split.startswith(Split.VALIDATION) for split in self.dataset)
 
         if separation_ratio is not None and Split.TRAIN in self.dataset:
             self._split_train(separation_ratio)
 
         if not has_validation_split:
-            self._split_validation_from_train(validation_size)
+            self._split_validation_from_train(validation_size, is_few_shot, examples_per_intent)
+        elif is_few_shot:
+            self._split_few_shot(examples_per_intent)
 
         for split in self.dataset:
             n_classes_in_split = self.dataset.get_n_classes(split)
@@ -182,6 +195,27 @@ def _split_ho(self, separation_ratio: FloatFromZeroToOne | None, validation_size
                 )
                 raise ValueError(message)
 
+    def _split_few_shot(self, examples_per_intent: int) -> None:
+        if Split.TRAIN in self.dataset:
+            self.dataset[Split.TRAIN], self.dataset[Split.VALIDATION] = create_few_shot_split(
+                self.dataset[Split.TRAIN],
+                self.dataset[Split.VALIDATION],
+                multilabel=self.dataset.multilabel,
+                label_column=self.dataset.label_feature,
+                random_seed=self._seed,
+                examples_per_label=examples_per_intent,
+            )
+        else:
+            for idx in range(2):
+                self.dataset[f"{Split.TRAIN}_{idx}"], self.dataset[f"{Split.VALIDATION}_{idx}"] = create_few_shot_split(
+                    self.dataset[f"{Split.TRAIN}_{idx}"],
+                    self.dataset[f"{Split.VALIDATION}_{idx}"],
+                    multilabel=self.dataset.multilabel,
+                    label_column=self.dataset.label_feature,
+                    random_seed=self._seed,
+                    examples_per_label=examples_per_intent,
+                )
+
     def _split_train(self, ratio: FloatFromZeroToOne) -> None:
         """Split on two sets.
 
@@ -199,7 +233,7 @@ def _split_train(self, ratio: FloatFromZeroToOne) -> None:
         )
         self.dataset.pop(Split.TRAIN)
 
-    def _split_cv(self) -> None:
+    def _split_cv(self, is_few_shot: bool, examples_per_intent: int) -> None:
         extra_splits = [split_name for split_name in self.dataset if split_name != Split.TEST]
         self.dataset[Split.TRAIN] = concatenate_datasets([self.dataset.pop(split_name) for split_name in extra_splits])
 
@@ -209,17 +243,21 @@ def _split_cv(self) -> None:
                 split=Split.TRAIN,
                 test_size=1 / (self.config.n_folds - j),
                 random_seed=self._seed,
+                is_few_shot=is_few_shot,
+                examples_per_intent=examples_per_intent,
                 allow_oos_in_train=True,
             )
         self.dataset[f"{Split.TRAIN}_{self.config.n_folds - 1}"] = self.dataset.pop(Split.TRAIN)
 
-    def _split_validation_from_train(self, size: float) -> None:
+    def _split_validation_from_train(self, size: float, is_few_shot: bool, examples_per_intent: int) -> None:
         if Split.TRAIN in self.dataset:
             self.dataset[Split.TRAIN], self.dataset[Split.VALIDATION] = split_dataset(
                 self.dataset,
                 split=Split.TRAIN,
                 test_size=size,
                 random_seed=self._seed,
+                is_few_shot=is_few_shot,
+                examples_per_intent=examples_per_intent,
                 allow_oos_in_train=True,
             )
         else:
@@ -229,6 +267,8 @@ def _split_validation_from_train(self, size: float) -> None:
                     split=f"{Split.TRAIN}_{idx}",
                     test_size=size,
                     random_seed=self._seed,
+                    is_few_shot=is_few_shot,
+                    examples_per_intent=examples_per_intent,
                     allow_oos_in_train=idx == 1,  # for decision node it's ok to have oos in train
                 )
 
diff --git a/autointent/context/data_handler/_stratification.py b/autointent/context/data_handler/_stratification.py
@@ -4,6 +4,7 @@
 It includes support for both single-label and multi-label stratified splitting.
 """
 
+import logging
 from collections.abc import Sequence
 
 import numpy as np
@@ -17,6 +18,8 @@
 from autointent import Dataset
 from autointent.custom_types import LabelType
 
+logger = logging.getLogger(__name__)
+
 
 class StratifiedSplitter:
     """A class for stratified splitting of datasets.
@@ -32,6 +35,8 @@ def __init__(
         label_feature: str,
         random_seed: int | None,
         shuffle: bool = True,
+        is_few_shot: bool = False,
+        examples_per_label: int = 8,
     ) -> None:
         """Initialize the StratifiedSplitter.
 
@@ -40,11 +45,15 @@ def __init__(
             label_feature: Name of the feature containing labels for stratification.
             random_seed: Seed for random number generation to ensure reproducibility.
             shuffle: Whether to shuffle the data before splitting.
+            is_few_shot: Whether to shrink the train split to a few-shot subset after splitting.
+            examples_per_label: Number of train examples to keep per label when `is_few_shot` is True.
         """
         self.test_size = test_size
         self.label_feature = label_feature
         self.random_seed = random_seed
         self.shuffle = shuffle
+        self.is_few_shot = is_few_shot
+        self.examples_per_label = examples_per_label
 
     def __call__(
         self, dataset: HFDataset, multilabel: bool, allow_oos_in_train: bool | None = None
@@ -71,7 +80,16 @@ def __call__(
             )
             raise ValueError(msg)
         splitter = self._split_allow_oos_in_train if allow_oos_in_train else self._split_disallow_oos_in_train
-        return splitter(dataset, multilabel)
+        train, test = splitter(dataset, multilabel)
+        if self.is_few_shot:
+            train, test = create_few_shot_split(
+                train,
+                test,
+                multilabel=multilabel,
+                label_column=self.label_feature,
+                examples_per_label=self.examples_per_label,
+            )
+        return train, test
 
     def _has_oos_samples(self, dataset: HFDataset) -> bool:
         """Check if the dataset contains out-of-scope samples.
@@ -287,6 +305,8 @@ def split_dataset(
     split: str,
     test_size: float,
     random_seed: int | None,
+    is_few_shot: bool = False,
+    examples_per_intent: int = 8,
     allow_oos_in_train: bool | None = None,
 ) -> tuple[HFDataset, HFDataset]:
     """Split a Dataset object into training and testing subsets.
@@ -296,6 +316,8 @@ def split_dataset(
         split: The specific data split to divide.
         test_size: Proportion of the dataset to include in the test split.
         random_seed: Seed for random number generation.
+        is_few_shot: Whether to shrink the train split to a few-shot subset after splitting.
+        examples_per_intent: Number of train examples to keep per label when `is_few_shot` is True.
         allow_oos_in_train: Whether to allow OOS samples in train split.
 
     Returns:
@@ -305,5 +327,74 @@ def split_dataset(
         test_size=test_size,
         label_feature=dataset.label_feature,
         random_seed=random_seed,
+        is_few_shot=is_few_shot,
+        examples_per_label=examples_per_intent,
     )
     return splitter(dataset[split], dataset.multilabel, allow_oos_in_train=allow_oos_in_train)
+
+
+def create_few_shot_split(
+    train_dataset: HFDataset,
+    validation_dataset: HFDataset,
+    label_column: str,
+    examples_per_label: int = 8,
+    multilabel: bool = False,
+    random_seed: int | None = None,
+) -> tuple[HFDataset, HFDataset]:
+    """Create a few-shot dataset split with a specified number of examples per label.
+
+    Args:
+        train_dataset: Hugging Face dataset to sample the few-shot train examples from.
+        validation_dataset: Hugging Face dataset that receives the train examples left unselected.
+        label_column: The name of the column containing labels (required, no default)
+        examples_per_label: Number of examples to include per label in the train split (default: 8)
+        multilabel: Whether the dataset is multi-label (default: False)
+        random_seed: Random seed used to shuffle each label's examples before sampling (default: None)
+
+    Returns:
+        A tuple containing the train and validation datasets.
+    """
+    # Add a unique index column to track examples
+    train_dataset = train_dataset.add_column("__index__", list(range(len(train_dataset))))
+    if multilabel:
+        _unique_labels = set()
+        for example in train_dataset:
+            if example[label_column] is not None:
+                _unique_labels.add(tuple(example[label_column]))
+        unique_labels = list(_unique_labels)
+    else:
+        unique_labels = train_dataset.unique(label_column)
+
+    # Create train dataset by sampling examples_per_label for each label
+    train_datasets = []
+    selected_indices = set()
+
+    for label in unique_labels:
+        if multilabel:
+            label_examples = train_dataset.filter(lambda row: tuple(row[label_column]) == label)  # noqa: B023
+        else:
+            label_examples = train_dataset.filter(lambda row: row[label_column] == label)  # noqa: B023
+        label_examples = label_examples.shuffle(seed=random_seed)
+
+        num_to_select = min(examples_per_label, len(label_examples))
+        selected_examples = label_examples.select(range(num_to_select))
+
+        if num_to_select < examples_per_label:
+            msg = (
+                f"Warning: Only {num_to_select} examples available for label '{label}', "
+                f"which is less than the requested {examples_per_label}"
+            )
+            logger.warning(msg)
+
+        train_datasets.append(selected_examples)
+        selected_indices.update([ex["__index__"] for ex in selected_examples])
+
+    # Create validation split with remaining examples
+    extra_validation_dataset = train_dataset.filter(
+        lambda example: example["__index__"] not in selected_indices
+    ).remove_columns("__index__")
+
+    validation_dataset = concatenate_datasets([validation_dataset, extra_validation_dataset])
+    train_dataset = concatenate_datasets(train_datasets).remove_columns("__index__")
+
+    return train_dataset, validation_dataset
diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json
@@ -71,7 +71,7 @@
                 },
                 "validation_size": {
                     "default": 0.2,
-                    "description": "Fraction of train samples to allocate for validation (if input dataset doesn't contain validation split).",
+                    "description": "Fraction of train samples to allocate for validation (if input dataset doesn't contain validation split). If `is_few_shot_train` is True, this value will be ignored.",
                     "maximum": 1,
                     "minimum": 0,
                     "title": "Validation Size",
@@ -91,6 +91,19 @@
                     "default": 0.5,
                     "description": "Set to float to prevent data leak between scoring and decision nodes.",
                     "title": "Separation Ratio"
+                },
+                "is_few_shot_train": {
+                    "default": false,
+                    "description": "Whether to use few-shot training.",
+                    "title": "Is Few Shot Train",
+                    "type": "boolean"
+                },
+                "examples_per_intent": {
+                    "default": 8,
+                    "description": "Number of examples per intent for few-shot validation. If None, all examples will be used.",
+                    "exclusiveMinimum": 0,
+                    "title": "Examples Per Intent",
+                    "type": "integer"
                 }
             },
             "title": "DataConfig",
@@ -362,7 +375,9 @@
                 "scheme": "ho",
                 "n_folds": 3,
                 "validation_size": 0.2,
-                "separation_ratio": 0.5
+                "separation_ratio": 0.5,
+                "is_few_shot_train": false,
+                "examples_per_intent": 8
             }
         },
         "search_space": {
diff --git a/tests/data/test_data_handler.py b/tests/data/test_data_handler.py
@@ -1,3 +1,5 @@
+from collections import Counter
+
 import pytest
 
 from autointent import Dataset
@@ -223,3 +225,20 @@ def test_cv_iterator(dataset):
         assert count_oos_labels(y_train) == specs["train"]["oos"]
         assert len(x_val) == len(y_val) == specs["val"]["total"]
         assert count_oos_labels(y_val) == specs["val"]["oos"]
+
+
+def test_few_shot_split(dataset):
+    dh = DataHandler(dataset, config=DataConfig(scheme="ho", is_few_shot_train=True, examples_per_intent=2))
+
+    desired_specs = {
+        "train_0": {0: 2, 1: 2, 2: 2, 3: 2},
+        "train_1": {2: 2, 0: 2, None: 2, 1: 1, 3: 1},
+        "validation_0": {0: 3, 1: 4, 2: 3, 3: 4},
+        "validation_1": {None: 14, 3: 1, 0: 1, 1: 1, 2: 1},
+        "test": {None: 4, 0: 2, 2: 2, 3: 2, 1: 2},
+    }
+
+    for data_split in dh.dataset:
+        assert (
+            Counter(dh.dataset[data_split][dh.dataset.label_feature]) == desired_specs[data_split]
+        ), f"Failed for {data_split}"
diff --git a/tests/data/test_stratificaiton.py b/tests/data/test_stratificaiton.py