|
5 | 5 | import asyncio |
6 | 6 | import string |
7 | 7 | import typing as t |
8 | | -from abc import ABC |
| 8 | +from abc import ABC, abstractmethod |
9 | 9 | from dataclasses import dataclass, field |
10 | 10 |
|
11 | | -from pydantic import BaseModel |
12 | 11 | from tqdm import tqdm |
13 | 12 |
|
14 | 13 | from ..embedding.base import BaseEmbedding |
15 | 14 | from ..llm import RagasLLM |
16 | | -from ..model.notion_model import NotionModel |
17 | 15 | from ..prompt.base import Prompt |
18 | 16 | from ..prompt.dynamic_few_shot import DynamicFewShotPrompt |
19 | 17 | from .result import MetricResult |
| 18 | +from pydantic import BaseModel |
20 | 19 |
|
21 | 20 | if t.TYPE_CHECKING: |
22 | | - from ragas_experimental.project.core import Project |
| 21 | + |
| 22 | + from ragas_experimental.dataset import Dataset |
23 | 23 |
|
24 | 24 |
|
25 | 25 | @dataclass |
@@ -92,39 +92,114 @@ async def abatch_score( |
92 | 92 | # Run all tasks concurrently and return results |
93 | 93 | return await asyncio.gather(*async_tasks) |
94 | 94 |
|
95 | | - def train( |
| 95 | + @abstractmethod |
| 96 | + def get_correlation(self, gold_labels: t.List[t.Any], predictions: t.List[t.Any]) -> float:
| 97 | + """ |
| 98 | + Calculate the correlation between gold scores and predicted scores. |
| 99 | + Subclasses must implement this for their specific metric type.
| 100 | + """ |
| 101 | + pass |
| 102 | + |
| 103 | + def align_and_validate( |
96 | 104 | self, |
97 | | - project: "Project", |
98 | | - experiment_names: t.List[str], |
99 | | - model: NotionModel, |
| 105 | + dataset: "Dataset", |
100 | 106 | embedding_model: BaseEmbedding, |
101 | | - method: t.Dict[str, t.Any], |
| 107 | + llm: RagasLLM, |
| 108 | + test_size: float = 0.2, |
| 109 | + random_state: int = 42, |
| 110 | + **kwargs: t.Dict[str, t.Any], |
102 | 111 | ): |
| 112 | + """ |
| 113 | + Args: |
| 114 | + dataset: The dataset to align the metric with; split internally into train and test sets using test_size and random_state.
| 115 | + embedding_model: The embedding model used for dynamic few-shot prompting. |
| 116 | + llm: The LLM instance to use for scoring. |
| 117 | +
|
| 118 | + Align the metric with the training split of the dataset and validate it against the held-out test split.
| 119 | + This method combines alignment and validation into a single step. |
| 120 | + """ |
| 121 | + train_dataset, test_dataset = dataset.train_test_split( |
| 122 | + test_size=test_size, random_state=random_state |
| 123 | + ) |
| 124 | + |
| 125 | + self.align(train_dataset, embedding_model, **kwargs) |
| 126 | + return self.validate_alignment(llm, test_dataset) |
| 127 | + |
| 128 | + def align( |
| 129 | + self, |
| 130 | + dataset: "Dataset", |
| 131 | + embedding_model: BaseEmbedding, |
| 132 | + **kwargs: t.Dict[str, t.Any], |
| 133 | + ): |
| 134 | + """ |
| 135 | + Args: |
| 136 | + dataset: The dataset to align the metric with.
| 137 | + **kwargs: Additional keyword arguments forwarded to DynamicFewShotPrompt.from_prompt.
| 138 | + embedding_model: The embedding model used for dynamic few-shot prompting. |
| 139 | +
|
| 140 | + Align the metric with the given dataset by converting its prompt to a dynamic few-shot prompt and adding the dataset rows as examples.
| 141 | + """ |
103 | 142 |
|
104 | 143 | assert isinstance(self.prompt, Prompt) |
105 | | - self.prompt = DynamicFewShotPrompt.from_prompt(self.prompt, embedding_model) |
106 | | - datasets = [] |
107 | | - for experiment_name in experiment_names: |
108 | | - experiment_data = project.get_experiment(experiment_name, model) |
109 | | - experiment_data.load() |
110 | | - datasets.append(experiment_data) |
111 | | - |
112 | | - total_items = sum([len(dataset) for dataset in datasets]) |
| 144 | + self.prompt = DynamicFewShotPrompt.from_prompt( |
| 145 | + self.prompt, embedding_model, **kwargs |
| 146 | + ) |
| 147 | + dataset.load() |
| 148 | + total_items = len(dataset) |
113 | 149 | input_vars = self.get_variables() |
114 | 150 | output_vars = [self.name, f"{self.name}_reason"] |
115 | 151 | with tqdm(total=total_items, desc="Processing examples") as pbar: |
116 | | - for dataset in datasets: |
117 | | - for row in dataset: |
118 | | - inputs = { |
119 | | - var: getattr(row, var) |
120 | | - for var in input_vars |
121 | | - if hasattr(row, var) |
122 | | - } |
123 | | - output = { |
124 | | - var: getattr(row, var) |
125 | | - for var in output_vars |
126 | | - if hasattr(row, var) |
127 | | - } |
128 | | - if output: |
129 | | - self.prompt.add_example(inputs, output) |
130 | | - pbar.update(1) |
| 152 | + for row in dataset: |
| 153 | + inputs = { |
| 154 | + var: getattr(row, var) for var in input_vars if hasattr(row, var) |
| 155 | + } |
| 156 | + output = { |
| 157 | + var: getattr(row, var) for var in output_vars if hasattr(row, var) |
| 158 | + } |
| 159 | + if output: |
| 160 | + self.prompt.add_example(inputs, output) |
| 161 | + pbar.update(1) |
| 162 | + |
| 163 | + def validate_alignment( |
| 164 | + self, |
| 165 | + llm: RagasLLM, |
| 166 | + test_dataset: "Dataset", |
| 167 | + mapping: t.Dict[str, str] = {}, |
| 168 | + ): |
| 169 | + """ |
| 170 | + Args: |
| 171 | + llm: The LLM instance to use for scoring. |
| 172 | + test_dataset: A Dataset instance containing the gold standard scores.
| 173 | + mapping: A dictionary mapping variable names expected by the metric to their corresponding names in the test dataset.
| 174 | +
|
| 175 | + Validate the alignment of the metric by comparing its predictions against the gold standard scores in the test dataset.
| 176 | + This method computes the correlation (via get_correlation) and the exact agreement rate between the gold standard scores and
| 177 | + the predicted scores from the metric.
| 178 | + """ |
| 179 | + |
| 180 | + test_dataset.load() |
| 181 | + gold_scores = [getattr(row, self.name) for row in test_dataset] |
| 182 | + pred_scores = [] |
| 183 | + for row in tqdm(test_dataset): |
| 184 | + values = { |
| 185 | + v: ( |
| 186 | + getattr(row, v) |
| 187 | + if v not in mapping |
| 188 | + else getattr(row, mapping.get(v, v)) |
| 189 | + ) |
| 190 | + for v in self.get_variables() |
| 191 | + } |
| 192 | + score = self.score(llm=llm, **values) |
| 193 | + pred_scores.append(score.result) |
| 194 | + |
| 195 | + df = test_dataset.to_pandas() |
| 196 | + df[f"{self.name}_pred"] = pred_scores |
| 197 | + correlation = self.get_correlation(gold_scores, pred_scores) |
| 198 | + agreement_rate = sum(x == y for x, y in zip(gold_scores, pred_scores)) / len( |
| 199 | + gold_scores |
| 200 | + ) |
| 201 | + return { |
| 202 | + "correlation": correlation, |
| 203 | + "agreement_rate": agreement_rate, |
| 204 | + "df": df, |
| 205 | + } |
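
A minimal sketch of how a subclass might satisfy the new abstract get_correlation contract. The DiscreteMetric name, the import path, and the use of scikit-learn's cohen_kappa_score are assumptions for illustration only, not part of this change.

    import typing as t

    from sklearn.metrics import cohen_kappa_score  # assumed dependency; any agreement/correlation measure works

    from ragas_experimental.metric.base import Metric  # hypothetical path to the base class edited above


    class DiscreteMetric(Metric):  # hypothetical subclass for label-valued metrics
        def get_correlation(self, gold_labels: t.List[t.Any], predictions: t.List[t.Any]) -> float:
            # Cohen's kappa: chance-corrected agreement between two label sequences.
            return float(cohen_kappa_score(gold_labels, predictions))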
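
A usage sketch of the combined align-and-validate flow under the same assumptions; metric, dataset, embedding_model, and llm are placeholders for objects the caller has already constructed.

    # The dataset must contain the metric's input variables plus gold labels stored
    # under the metric's name (and "<name>_reason") so align() can build few-shot examples.
    report = metric.align_and_validate(
        dataset=dataset,
        embedding_model=embedding_model,
        llm=llm,
        test_size=0.2,
        random_state=42,
    )
    print(report["correlation"], report["agreement_rate"])
    report["df"].head()  # test split as a DataFrame with an added "<name>_pred" column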