Commit 7e979a8

feat: align and validate alignment rate (#2094)

First align the metric using experiment data:

```python
insight_relevance.align(
    project=p,
    experiment_names=['modest_huffman'],
    model=ExperimentDataRow,
    embedding_model=embedding,
)
```

then check the alignment rate:

```python
result = insight_relevance.validate_alignment(
    llm=llm,
    gold_experiment=human_labelled_exp,
)
```

```
(0.8571428571428572, 0.95)
```
1 parent daa8ca2 commit 7e979a8

File tree

8 files changed: +218 additions, -35 deletions

experimental/ragas_experimental/__init__.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -11,14 +11,12 @@
 except PackageNotFoundError:
     __version__ = "unknown"
 
-import ragas_experimental.model.notion_typing as nmt
 from ragas_experimental.model.pydantic_model import (
     ExtendedPydanticBaseModel as BaseModel,
 )
 
-from .model.notion_model import NotionModel
 from .project.core import Project
 
 # Import the main Project class - decorators are added automatically in core.py
 
-__all__ = ["Project", "NotionModel", "nmt", "BaseModel"]
+__all__ = ["Project", "BaseModel"]
```

experimental/ragas_experimental/dataset.py

Lines changed: 49 additions & 0 deletions

```diff
@@ -334,3 +334,52 @@ def get(
         return self._backend.get_entry_by_field(field_name, field_value, self.model)
 
         return None
+
+    def train_test_split(
+        self, test_size: float = 0.2, random_state: t.Optional[int] = None
+    ) -> t.Tuple["Dataset[BaseModelType]", "Dataset[BaseModelType]"]:
+        """Split the dataset into training and testing sets.
+
+        Args:
+            test_size: Proportion of the dataset to include in the test split (default: 0.2)
+            random_state: Random seed for reproducibility (default: None)
+        Returns:
+            A tuple of two Datasets: (train_dataset, test_dataset)
+        """
+        if not self._entries:
+            self.load()
+
+        # Shuffle entries if random_state is set
+        if random_state is not None:
+            import random
+
+            random.seed(random_state)
+            random.shuffle(self._entries)
+
+        # Calculate split index
+        split_index = int(len(self._entries) * (1 - test_size))
+
+        # Create new dataset instances without full initialization
+        train_dataset = object.__new__(type(self))
+        test_dataset = object.__new__(type(self))
+
+        # Copy essential attributes
+        for dataset in [train_dataset, test_dataset]:
+            dataset.model = self.model
+            dataset.project_id = self.project_id
+            dataset._backend = self._backend
+            dataset.backend_type = self.backend_type
+            dataset.datatable_type = self.datatable_type
+
+        # Set specific attributes for each dataset
+        train_dataset.name = f"{self.name}_train"
+        train_dataset.dataset_id = f"{self.dataset_id}_train"
+
+        test_dataset.name = f"{self.name}_test"
+        test_dataset.dataset_id = f"{self.dataset_id}_test"
+
+        # Assign entries to the new datasets
+        train_dataset._entries = self._entries[:split_index]
+        test_dataset._entries = self._entries[split_index:]
+
+        return train_dataset, test_dataset
```
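A rough usage sketch for the new split helper; `dataset` below stands for any already-created `Dataset` instance and is illustrative, not part of this commit. Note that entries are shuffled only when `random_state` is provided; otherwise the split keeps the stored order.

```python
# Hypothetical example: `dataset` is an existing ragas_experimental Dataset.
train_ds, test_ds = dataset.train_test_split(test_size=0.2, random_state=42)

# An 80/20 split of the loaded entries, shuffled reproducibly via random_state.
print(len(train_ds), len(test_ds))
```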

experimental/ragas_experimental/metric/base.py

Lines changed: 107 additions & 32 deletions

```diff
@@ -5,21 +5,21 @@
 import asyncio
 import string
 import typing as t
-from abc import ABC
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 
-from pydantic import BaseModel
 from tqdm import tqdm
 
 from ..embedding.base import BaseEmbedding
 from ..llm import RagasLLM
-from ..model.notion_model import NotionModel
 from ..prompt.base import Prompt
 from ..prompt.dynamic_few_shot import DynamicFewShotPrompt
 from .result import MetricResult
+from pydantic import BaseModel
 
 if t.TYPE_CHECKING:
-    from ragas_experimental.project.core import Project
+
+    from ragas_experimental.dataset import Dataset
 
 
 @dataclass
@@ -92,39 +92,114 @@ async def abatch_score(
         # Run all tasks concurrently and return results
         return await asyncio.gather(*async_tasks)
 
-    def train(
+    @abstractmethod
+    def get_correlation(self, gold_label, predictions) -> float:
+        """
+        Calculate the correlation between gold scores and predicted scores.
+        This is a placeholder method and should be implemented based on the specific metric.
+        """
+        pass
+
+    def align_and_validate(
         self,
-        project: "Project",
-        experiment_names: t.List[str],
-        model: NotionModel,
+        dataset: "Dataset",
         embedding_model: BaseEmbedding,
-        method: t.Dict[str, t.Any],
+        llm: RagasLLM,
+        test_size: float = 0.2,
+        random_state: int = 42,
+        **kwargs: t.Dict[str, t.Any],
     ):
+        """
+        Args:
+            dataset: experiment to align the metric with.
+            embedding_model: The embedding model used for dynamic few-shot prompting.
+            llm: The LLM instance to use for scoring.
+
+        Align the metric with the specified experiments and validate it against a gold standard experiment.
+        This method combines alignment and validation into a single step.
+        """
+        train_dataset, test_dataset = dataset.train_test_split(
+            test_size=test_size, random_state=random_state
+        )
+
+        self.align(train_dataset, embedding_model, **kwargs)
+        return self.validate_alignment(llm, test_dataset)
+
+    def align(
+        self,
+        dataset: "Dataset",
+        embedding_model: BaseEmbedding,
+        **kwargs: t.Dict[str, t.Any],
+    ):
+        """
+        Args:
+            experiment: experiment to align the metric with.
+            model: The Pydantic model used for the experiment data.
+            embedding_model: The embedding model used for dynamic few-shot prompting.
+
+        Align the metric with the specified experiments by different optimization methods.
+        """
 
         assert isinstance(self.prompt, Prompt)
-        self.prompt = DynamicFewShotPrompt.from_prompt(self.prompt, embedding_model)
-        datasets = []
-        for experiment_name in experiment_names:
-            experiment_data = project.get_experiment(experiment_name, model)
-            experiment_data.load()
-            datasets.append(experiment_data)
-
-        total_items = sum([len(dataset) for dataset in datasets])
+        self.prompt = DynamicFewShotPrompt.from_prompt(
+            self.prompt, embedding_model, **kwargs
+        )
+        dataset.load()
+        total_items = len(dataset)
         input_vars = self.get_variables()
         output_vars = [self.name, f"{self.name}_reason"]
         with tqdm(total=total_items, desc="Processing examples") as pbar:
-            for dataset in datasets:
-                for row in dataset:
-                    inputs = {
-                        var: getattr(row, var)
-                        for var in input_vars
-                        if hasattr(row, var)
-                    }
-                    output = {
-                        var: getattr(row, var)
-                        for var in output_vars
-                        if hasattr(row, var)
-                    }
-                    if output:
-                        self.prompt.add_example(inputs, output)
-                    pbar.update(1)
+            for row in dataset:
+                inputs = {
+                    var: getattr(row, var) for var in input_vars if hasattr(row, var)
+                }
+                output = {
+                    var: getattr(row, var) for var in output_vars if hasattr(row, var)
+                }
+                if output:
+                    self.prompt.add_example(inputs, output)
+                pbar.update(1)
+
+    def validate_alignment(
+        self,
+        llm: RagasLLM,
+        test_dataset: "Dataset",
+        mapping: t.Dict[str, str] = {},
+    ):
+        """
+        Args:
+            llm: The LLM instance to use for scoring.
+            test_dataset: A Dataset instance containing the gold standard scores.
+            mapping: A dictionary mapping variable names expected by metrics to their corresponding names in the gold experiment.
+
+        Validate the alignment of the metric by comparing the scores against a gold standard experiment.
+        This method computes the Cohen's Kappa score and agreement rate between the gold standard scores and
+        the predicted scores from the metric.
+        """
+
+        test_dataset.load()
+        gold_scores = [getattr(row, self.name) for row in test_dataset]
+        pred_scores = []
+        for row in tqdm(test_dataset):
+            values = {
+                v: (
+                    getattr(row, v)
+                    if v not in mapping
+                    else getattr(row, mapping.get(v, v))
+                )
+                for v in self.get_variables()
+            }
+            score = self.score(llm=llm, **values)
+            pred_scores.append(score.result)
+
+        df = test_dataset.to_pandas()
+        df[f"{self.name}_pred"] = pred_scores
+        correlation = self.get_correlation(gold_scores, pred_scores)
+        agreement_rate = sum(x == y for x, y in zip(gold_scores, pred_scores)) / len(
+            gold_scores
+        )
+        return {
+            "correlation": correlation,
+            "agreement_rate": agreement_rate,
+            "df": df,
+        }
```
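Putting the new methods together, the intended flow appears to be: split a labelled dataset, seed the dynamic few-shot prompt from the train half, then re-score the test half and compare against the gold labels. A hedged sketch based on the signatures above; `my_metric`, `labelled_dataset`, `embedding`, `llm`, `train_dataset`, and `test_dataset` are all placeholders assumed to exist elsewhere:

```python
# One-step variant: split, align, and validate in a single call (names are illustrative).
report = my_metric.align_and_validate(
    dataset=labelled_dataset,    # Dataset whose rows already carry gold scores
    embedding_model=embedding,   # BaseEmbedding used for dynamic few-shot retrieval
    llm=llm,                     # RagasLLM used to re-score the held-out rows
    test_size=0.2,
    random_state=42,
)

# Two-step variant: align on one dataset, validate against another.
my_metric.align(train_dataset, embedding_model=embedding)
report = my_metric.validate_alignment(llm=llm, test_dataset=test_dataset)
print(report["correlation"], report["agreement_rate"])  # report["df"] holds per-row predictions
```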

experimental/ragas_experimental/metric/discrete.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -22,5 +22,21 @@ def __post_init__(self):
             "response_model", result=(t.Literal[values], ...), reason=(str, ...)
         )
 
+    def get_correlation(
+        self, gold_labels: t.List[str], predictions: t.List[str]
+    ) -> float:
+        """
+        Calculate the correlation between gold labels and predictions.
+        This is a placeholder method and should be implemented based on the specific metric.
+        """
+        try:
+            from sklearn.metrics import cohen_kappa_score
+        except ImportError:
+            raise ImportError(
+                "scikit-learn is required for correlation calculation. "
+                "Please install it with `pip install scikit-learn`."
+            )
+        return cohen_kappa_score(gold_labels, predictions)
+
 
 discrete_metric = create_metric_decorator(DiscreteMetric)
```
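The discrete implementation hands both label lists to scikit-learn's `cohen_kappa_score`, which measures chance-corrected agreement. A small self-contained illustration with made-up labels (assuming scikit-learn is installed):

```python
from sklearn.metrics import cohen_kappa_score

# Fabricated gold vs. predicted pass/fail labels, just to show the call shape.
gold = ["pass", "fail", "pass", "pass", "fail"]
pred = ["pass", "fail", "fail", "pass", "fail"]

# 1.0 = perfect agreement, 0.0 = no better than chance, negative = worse than chance.
print(cohen_kappa_score(gold, pred))
```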

experimental/ragas_experimental/metric/numeric.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -19,5 +19,21 @@ def __post_init__(self):
         super().__post_init__()
         self._response_model = create_model("response_model", result=(float, ...))
 
+    def get_correlation(
+        self, gold_labels: t.List[float], predictions: t.List[float]
+    ) -> float:
+        """
+        Calculate the correlation between gold labels and predictions.
+        This is a placeholder method and should be implemented based on the specific metric.
+        """
+        try:
+            from scipy.stats import pearsonr
+        except ImportError:
+            raise ImportError(
+                "scipy is required for correlation calculation. "
+                "Please install it with `pip install scipy`."
+            )
+        return pearsonr(gold_labels, predictions)[0]
+
 
 numeric_metric = create_metric_decorator(NumericMetric)
```
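For numeric metrics the correlation is `scipy.stats.pearsonr`, keeping only the coefficient at index 0 and discarding the p-value. A tiny illustration with invented scores:

```python
from scipy.stats import pearsonr

# Fabricated gold vs. predicted numeric scores.
gold = [0.2, 0.5, 0.7, 0.9]
pred = [0.25, 0.40, 0.75, 0.85]

r, p_value = pearsonr(gold, pred)
print(r)  # this coefficient is what get_correlation returns; p_value is dropped
```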

experimental/ragas_experimental/metric/ranking.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -23,5 +23,27 @@ def __post_init__(self):
             reason=(str, Field(..., description="Reasoning for the ranking")),
         )
 
+    def get_correlation(
+        self, gold_labels: t.List[str], predictions: t.List[str]
+    ) -> float:
+        """
+        Calculate the correlation between gold labels and predictions.
+        This is a placeholder method and should be implemented based on the specific metric.
+        """
+        try:
+            from sklearn.metrics import cohen_kappa_score
+        except ImportError:
+            raise ImportError(
+                "scikit-learn is required for correlation calculation. "
+                "Please install it with `pip install scikit-learn`."
+            )
+
+        kappa_scores = []
+        for gold_label, prediction in zip(gold_labels, predictions):
+            kappa = cohen_kappa_score(gold_label, prediction, weights="quadratic")
+            kappa_scores.append(kappa)
+
+        return sum(kappa_scores) / len(kappa_scores) if kappa_scores else 0.0
+
 
 ranking_metric = create_metric_decorator(RankingMetric)
```
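For rankings, each gold/predicted pair is itself a list of items, so the commit applies `cohen_kappa_score` per pair with quadratic weights and averages the results. A toy illustration that mirrors the loop above; the rankings are fabricated and not part of this commit:

```python
from sklearn.metrics import cohen_kappa_score

# Each element is one example's ranking of the items "a", "b", "c".
gold_rankings = [["a", "b", "c"], ["b", "a", "c"]]
pred_rankings = [["a", "c", "b"], ["b", "a", "c"]]

kappas = [
    cohen_kappa_score(gold, pred, weights="quadratic")
    for gold, pred in zip(gold_rankings, pred_rankings)
]
print(sum(kappas) / len(kappas) if kappas else 0.0)
```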

experimental/tests/e2e/test_integration.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -30,6 +30,9 @@ class IntegrationMetric(Metric):
     def __post_init__(self):
         super().__post_init__()
         self._response_model = EvaluationResult
+
+    def get_correlation(self, gold_label, predictions) -> float:
+        return super().get_correlation(gold_label, predictions)
 
 
 @pytest.fixture
```

experimental/tests/unit/test_metric_base.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -19,6 +19,10 @@ class CustomMetric(Metric):
     def __post_init__(self):
         super().__post_init__()
         self._response_model = MetricResponseModel
+
+    def get_correlation(self, gold_labels: t.List[str], predictions: t.List[str]) -> float:
+
+        return 0.0  # Placeholder for correlation logic
 
 
 @pytest.fixture
```
