Skip to content

Commit 09521a7

Browse files
committed
some working version
1 parent e5e3ead commit 09521a7

File tree

1 file changed

+111
-56
lines changed

1 file changed

+111
-56
lines changed

autointent/generation/utterances/evolution/dspy_evolver.py

Lines changed: 111 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
"""
22
Evolutionary strategy for augmenting utterances.
3-
4-
Deeply inspired by DeepEval evolutions.
53
"""
4+
65
import copy
7-
import os
6+
import logging
87
import random
8+
from collections import Counter
99
from pathlib import Path
1010
from typing import Any
1111

1212
import dspy
13-
from datasets import Dataset as HFDataset, concatenate_datasets
14-
from dspy.evaluate import SemanticF1
15-
import logging
13+
from datasets import Dataset as HFDataset
14+
from datasets import concatenate_datasets
15+
16+
# from dspy.evaluate import CompleteAndGrounded, SemanticF1, answer_exact_match
17+
from dspy.evaluate.auto_evaluation import f1_score
1618

1719
from autointent import Dataset, Pipeline
1820
from autointent.custom_types import Split
1921

22+
logging.basicConfig(level=logging.INFO)
2023
logger = logging.getLogger(__name__)
21-
logging.basicConfig(level=logging.DEBUG)
2224

2325
SEARCH_SPACE = [
2426
{
@@ -42,31 +44,80 @@
4244
]
4345

4446

45-
# Define a DSPy signature for text augmentation.
46-
class TextAugmentSignature(dspy.Signature):
47-
text: str = dspy.InputField()
48-
# n_examples: int = dspy.InputField()
49-
augmented_texts: list[str] = dspy.OutputField(
50-
desc="List of augmented texts that preserve the original meaning but use varied phrasing."
51-
)
47+
def repetition_factor(true_text: str, augmented_text: str) -> float:
48+
"""
49+
Calculate the ROUGE-1 F1 score between a single true text and a single augmented text.
50+
51+
ROUGE-1 F1 is computed as:
52+
F1 = 2 * (precision * recall) / (precision + recall)
53+
where:
54+
- precision = (overlap in unigrams) / (total unigrams in augmented text)
55+
- recall = (overlap in unigrams) / (total unigrams in true text)
56+
57+
Args:
58+
true_text: A ground truth text.
59+
augmented_text: An augmented/generated text.
60+
61+
Returns:
62+
float: The ROUGE-1 F1 score for the pair (0.0 if either text is empty).
63+
64+
Note:
65+
Returns 0.0 when either text has no tokens; no exception is raised.
66+
"""
67+
true_tokens = true_text.split()
68+
aug_tokens = augmented_text.split()
69+
if not true_tokens or not aug_tokens:
70+
return 0.0
71+
true_counts = Counter(true_tokens)
72+
aug_counts = Counter(aug_tokens)
73+
# Calculate the token overlap using the minimum count for common tokens
74+
overlap = sum(min(true_counts[token], aug_counts[token]) for token in true_counts.keys() & aug_counts.keys())
75+
precision = overlap / len(aug_tokens)
76+
recall = overlap / len(true_tokens)
77+
if precision + recall == 0:
78+
f1 = 0.0
79+
else:
80+
f1 = 2 * precision * recall / (precision + recall)
81+
return f1
82+
83+
84+
class SemanticRecallPrecision(dspy.Signature):
85+
"""
86+
Compare a system's response to the ground truth to compute its recall and precision.
87+
If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
88+
"""
89+
90+
# Copied from dspy
91+
92+
question: str = dspy.InputField()
93+
ground_truth: str = dspy.InputField()
94+
system_response: str = dspy.InputField()
95+
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
96+
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
97+
98+
99+
class AugmentSemanticF1(dspy.Module):
100+
# adapted SemanticF1
101+
def __init__(self, threshold: float = 0.66, **kwargs: Any) -> None:
102+
self.threshold = threshold
103+
self.module = dspy.ChainOfThought(SemanticRecallPrecision)
104+
105+
def forward(
106+
self, example: dspy.Example, pred: dspy.Prediction, trace: list[dspy.Prediction] | None = None
107+
) -> float | bool:
108+
# Compute base scores using the existing semantic metric.
109+
scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
110+
base_score = f1_score(scores.precision, scores.recall)
111+
112+
# Similarity factor: higher token overlap with the ground truth gives a HIGHER value. NOTE(review): multiplying the base score by this rewards repeating the original rather than penalizing it — confirm intent.
113+
penalty = repetition_factor(example.response, pred.response)
114+
115+
# Apply penalty to the base score.
116+
final_score = base_score * penalty
117+
# Return the final score, or a boolean based on the threshold if trace is provided.
118+
return final_score if trace is None else final_score >= self.threshold
52119

53120

54-
# # Define a DSPy module that implements text augmentation.
55-
# class TextAugmenter(dspy.Module):
56-
# def __init__(self) -> None:
57-
# # Here, we use a ChainOfThought module with the defined signature.
58-
# # The module is responsible for "thinking through" and generating multiple text variants.
59-
# super().__init__()
60-
# self.generator = dspy.ChainOfThought("text, n_examples -> augmented_texts")
61-
#
62-
# def forward(self, text: str, n_examples: int) -> dspy.Prediction:
63-
# # Invoke the underlying generator with the input text and desired number of examples.
64-
# return self.generator(text=text, n_examples=n_examples)
65-
66-
67-
os.environ['MISTRAL_API_KEY'] = ""
68-
os.environ["OPENROUTER_API_KEY"] = ""
69-
70121
class DSPYIncrementalUtteranceEvolver:
71122
"""Incremental evolutionary strategy to augmenting utterances using DSPy."""
72123

@@ -78,15 +129,17 @@ def __init__(
78129
"""Initialize."""
79130
self.search_space = self._choose_search_space(search_space)
80131
random.seed(seed)
81-
# full list of providers
132+
82133
turbo = dspy.LM(
83-
...,
84-
model_type='chat'
134+
"hosted_vllm/x5-airun-medium-coder-prod",
135+
api_base="http://mn-rtx01.x5.ru:8000/v1",
136+
# api_key="test",
137+
model_type="chat",
85138
)
86139
dspy.settings.configure(lm=turbo)
87140
# self.generator = dspy.ChainOfThought("text, n_examples -> augmented_texts: list[str]")
88141
# input should be question and response is augmented. question and response required for metric
89-
self.generator = dspy.ChainOfThought("question -> response: list[str]")
142+
self.generator = dspy.ChainOfThought("question -> response: str")
90143

91144
def _choose_search_space(self, search_space: str | None) -> list[dict[str, Any]] | Path | str:
92145
if search_space is None:
@@ -113,43 +166,48 @@ def augment(
113166
dspy.Example(
114167
question=sample[Dataset.utterance_feature],
115168
# n_examples=1,
116-
response=sample[Dataset.utterance_feature] # Use original as reference
169+
response=sample[Dataset.utterance_feature], # Use original as reference
117170
).with_inputs(
118171
"question",
119172
# "n_examples"
120173
)
121174
for sample in original_split
122175
]
123176

124-
for _ in range(n_evolutions):
125-
metric = SemanticF1()
177+
for i in range(n_evolutions):
178+
metric = AugmentSemanticF1()
126179

127180
optimizer = dspy.MIPROv2(
128-
metric=metric,
129-
auto="medium",
130-
num_threads=batch_size,
131-
log_dir="logs",
181+
metric=metric, # SemanticF1
182+
# auto="medium", # can be low, medium, high. this setting will override params in compile
183+
# num_threads=batch_size,
184+
# log_dir="logs",
132185
)
133186
optimized_module = optimizer.compile(
134187
self.generator,
135188
trainset=dspy_dataset,
136189
requires_permission_to_run=False,
137-
max_bootstrapped_demos=4,
138-
max_labeled_demos=4
190+
minibatch=False,
191+
# max_bootstrapped_demos=4,
192+
# max_labeled_demos=4,
193+
num_trials=5,
139194
)
195+
# evaluate(optimized_module)
196+
# try:
197+
self.generator.save("generator/", save_program=True)
198+
# should be dir + file *.json or *.pkl
199+
self.generator.save("generator/generator_state.json", save_program=False)
200+
201+
optimized_module.save("optimized_module", save_program=True)
202+
optimized_module.save("optimized_module/optimized_module.json", save_program=False)
203+
# Generate new samples. NOTE(review): the generator signature now returns `response: str` (was `list[str]`), so `for ut in prediction.response` below iterates single characters, not utterances — confirm.
140204
new_samples = []
141205
for sample in original_split:
142206
utterance = sample[Dataset.utterance_feature]
143207
label = sample[Dataset.label_feature]
144-
prediction = optimized_module(text=utterance)
208+
prediction = optimized_module(question=utterance)
145209
new_samples.extend(
146-
[
147-
{
148-
Dataset.label_feature: label,
149-
Dataset.utterance_feature: ut
150-
}
151-
for ut in prediction.response
152-
]
210+
[{Dataset.label_feature: label, Dataset.utterance_feature: ut} for ut in prediction.response]
153211
)
154212

155213
new_samples_dataset = HFDataset.from_list(new_samples)
@@ -178,8 +236,5 @@ def augment(
178236

179237
# Example usage
180238
dataset = Dataset.from_hub("AutoIntent/clinc150_subset")
181-
evolver = DSPYIncrementalUtteranceEvolver(
182-
seed=42,
183-
search_space=None
184-
)
185-
augmented_dataset = evolver.augment(dataset, split_name=Split.TEST, n_evolutions=5)
239+
evolver = DSPYIncrementalUtteranceEvolver(seed=42, search_space=None)
240+
augmented_dataset = evolver.augment(dataset, split_name=Split.TEST, n_evolutions=2)

0 commit comments

Comments
 (0)