|
5 | 5 | import asyncio |
6 | 6 | import string |
7 | 7 | import typing as t |
8 | | -from abc import ABC |
| 8 | +from abc import ABC, abstractmethod |
9 | 9 | from dataclasses import dataclass, field |
10 | 10 |
|
11 | | -from pydantic import BaseModel |
12 | 11 | from tqdm import tqdm |
13 | 12 |
|
14 | 13 | from ..embedding.base import BaseEmbedding |
15 | 14 | from ..llm import RagasLLM |
16 | | -from ..model.notion_model import NotionModel |
17 | 15 | from ..prompt.base import Prompt |
18 | 16 | from ..prompt.dynamic_few_shot import DynamicFewShotPrompt |
19 | 17 | from .result import MetricResult |
| 18 | +from pydantic import BaseModel |
20 | 19 |
|
21 | 20 | if t.TYPE_CHECKING: |
22 | | - from ragas_experimental.project.core import Project |
| 21 | + |
| 22 | + from ragas_experimental.dataset import Dataset |
23 | 23 |
|
24 | 24 |
|
25 | 25 | @dataclass |
@@ -92,39 +92,114 @@ async def abatch_score( |
92 | 92 | # Run all tasks concurrently and return results |
93 | 93 | return await asyncio.gather(*async_tasks) |
94 | 94 |
|
95 | | - def train( |
| 95 | + @abstractmethod |
| 96 | + def get_correlation(self, gold_labels: t.List[t.Any], predictions: t.List[t.Any]) -> float:
| 97 | + """ |
| 98 | + Calculate the correlation between gold scores and predicted scores. |
| 99 | + Subclasses must implement this for their specific metric type.
| 100 | + """ |
| 101 | + pass |
| 102 | + |
| 103 | + def align_and_validate( |
96 | 104 | self, |
97 | | - project: "Project", |
98 | | - experiment_names: t.List[str], |
99 | | - model: NotionModel, |
| 105 | + dataset: "Dataset", |
100 | 106 | embedding_model: BaseEmbedding, |
101 | | - method: t.Dict[str, t.Any], |
| 107 | + llm: RagasLLM, |
| 108 | + test_size: float = 0.2, |
| 109 | + random_state: int = 42, |
| 110 | + **kwargs: t.Dict[str, t.Any], |
102 | 111 | ): |
| 112 | + """ |
| 113 | + Args: |
| 114 | + dataset: The dataset to align the metric with; split internally into train and test sets using test_size and random_state.
| 115 | + embedding_model: The embedding model used for dynamic few-shot prompting. |
| 116 | + llm: The LLM instance to use for scoring. |
| 117 | +
|
| 118 | + Align the metric with the training split of the dataset and validate it against the held-out test split.
| 119 | + This method combines alignment and validation into a single step. |
| 120 | + """ |
| 121 | + train_dataset, test_dataset = dataset.train_test_split( |
| 122 | + test_size=test_size, random_state=random_state |
| 123 | + ) |
| 124 | + |
| 125 | + self.align(train_dataset, embedding_model, **kwargs) |
| 126 | + return self.validate_alignment(llm, test_dataset) |
| 127 | + |
| 128 | + def align( |
| 129 | + self, |
| 130 | + dataset: "Dataset", |
| 131 | + embedding_model: BaseEmbedding, |
| 132 | + **kwargs: t.Dict[str, t.Any], |
| 133 | + ): |
| 134 | + """ |
| 135 | + Args: |
| 136 | + dataset: The dataset to align the metric with.
| 137 | + **kwargs: Additional keyword arguments forwarded to DynamicFewShotPrompt.from_prompt.
| 138 | + embedding_model: The embedding model used for dynamic few-shot prompting. |
| 139 | +
|
| 140 | + Align the metric with the given dataset by converting its prompt to a dynamic few-shot prompt and adding the dataset rows as examples.
| 141 | + """ |
103 | 142 |
|
104 | 143 | assert isinstance(self.prompt, Prompt) |
105 | | - self.prompt = DynamicFewShotPrompt.from_prompt(self.prompt, embedding_model) |
106 | | - datasets = [] |
107 | | - for experiment_name in experiment_names: |
108 | | - experiment_data = project.get_experiment(experiment_name, model) |
109 | | - experiment_data.load() |
110 | | - datasets.append(experiment_data) |
111 | | - |
112 | | - total_items = sum([len(dataset) for dataset in datasets]) |
| 144 | + self.prompt = DynamicFewShotPrompt.from_prompt( |
| 145 | + self.prompt, embedding_model, **kwargs |
| 146 | + ) |
| 147 | + dataset.load() |
| 148 | + total_items = len(dataset) |
113 | 149 | input_vars = self.get_variables() |
114 | 150 | output_vars = [self.name, f"{self.name}_reason"] |
115 | 151 | with tqdm(total=total_items, desc="Processing examples") as pbar: |
116 | | - for dataset in datasets: |
117 | | - for row in dataset: |
118 | | - inputs = { |
119 | | - var: getattr(row, var) |
120 | | - for var in input_vars |
121 | | - if hasattr(row, var) |
122 | | - } |
123 | | - output = { |
124 | | - var: getattr(row, var) |
125 | | - for var in output_vars |
126 | | - if hasattr(row, var) |
127 | | - } |
128 | | - if output: |
129 | | - self.prompt.add_example(inputs, output) |
130 | | - pbar.update(1) |
| 152 | + for row in dataset: |
| 153 | + inputs = { |
| 154 | + var: getattr(row, var) for var in input_vars if hasattr(row, var) |
| 155 | + } |
| 156 | + output = { |
| 157 | + var: getattr(row, var) for var in output_vars if hasattr(row, var) |
| 158 | + } |
| 159 | + if output: |
| 160 | + self.prompt.add_example(inputs, output) |
| 161 | + pbar.update(1) |
| 162 | + |
| 163 | + def validate_alignment( |
| 164 | + self, |
| 165 | + llm: RagasLLM, |
| 166 | + test_dataset: "Dataset", |
| 167 | + mapping: t.Dict[str, str] = {}, |
| 168 | + ): |
| 169 | + """ |
| 170 | + Args: |
| 171 | + llm: The LLM instance to use for scoring. |
| 172 | + test_dataset: A Dataset instance containing the gold standard scores.
| 173 | + mapping: A dictionary mapping variable names expected by the metric to their corresponding names in the test dataset.
| 174 | +
|
| 175 | + Validate the alignment of the metric by comparing its predictions against the gold standard scores in the test dataset.
| 176 | + This method computes the correlation (via get_correlation) and the exact agreement rate between the gold standard scores and
| 177 | + the predicted scores from the metric.
| 178 | + """ |
| 179 | + |
| 180 | + test_dataset.load() |
| 181 | + gold_scores = [getattr(row, self.name) for row in test_dataset] |
| 182 | + pred_scores = [] |
| 183 | + for row in tqdm(test_dataset): |
| 184 | + values = { |
| 185 | + v: ( |
| 186 | + getattr(row, v) |
| 187 | + if v not in mapping |
| 188 | + else getattr(row, mapping.get(v, v)) |
| 189 | + ) |
| 190 | + for v in self.get_variables() |
| 191 | + } |
| 192 | + score = self.score(llm=llm, **values) |
| 193 | + pred_scores.append(score.result) |
| 194 | + |
| 195 | + df = test_dataset.to_pandas() |
| 196 | + df[f"{self.name}_pred"] = pred_scores |
| 197 | + correlation = self.get_correlation(gold_scores, pred_scores) |
| 198 | + agreement_rate = sum(x == y for x, y in zip(gold_scores, pred_scores)) / len( |
| 199 | + gold_scores |
| 200 | + ) |
| 201 | + return { |
| 202 | + "correlation": correlation, |
| 203 | + "agreement_rate": agreement_rate, |
| 204 | + "df": df, |
| 205 | + } |
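
A minimal sketch of how a subclass might satisfy the new abstract get_correlation contract. The DiscreteMetric name, the import path, and the use of scikit-learn's cohen_kappa_score are assumptions for illustration only, not part of this change.

    import typing as t

    from sklearn.metrics import cohen_kappa_score  # assumed dependency; any agreement/correlation measure works

    from ragas_experimental.metric.base import Metric  # hypothetical path to the base class edited above


    class DiscreteMetric(Metric):  # hypothetical subclass for label-valued metrics
        def get_correlation(self, gold_labels: t.List[t.Any], predictions: t.List[t.Any]) -> float:
            # Cohen's kappa: chance-corrected agreement between two label sequences.
            return float(cohen_kappa_score(gold_labels, predictions))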
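
A usage sketch of the combined align-and-validate flow under the same assumptions; metric, dataset, embedding_model, and llm are placeholders for objects the caller has already constructed.

    # The dataset must contain the metric's input variables plus gold labels stored
    # under the metric's name (and "<name>_reason") so align() can build few-shot examples.
    report = metric.align_and_validate(
        dataset=dataset,
        embedding_model=embedding_model,
        llm=llm,
        test_size=0.2,
        random_state=42,
    )
    print(report["correlation"], report["agreement_rate"])
    report["df"].head()  # test split as a DataFrame with an added "<name>_pred" column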