|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import typing as t |
| 4 | +from dataclasses import dataclass, field |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +from langchain_core.pydantic_v1 import BaseModel, Field |
| 8 | + |
| 9 | +from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions |
| 10 | +from ragas.llms.prompt import Prompt |
| 11 | +from ragas.metrics.base import EvaluationMode, MetricWithLLM |
| 12 | + |
| 13 | +if t.TYPE_CHECKING: |
| 14 | + from langchain_core.callbacks import Callbacks |
| 15 | + |
| 16 | + from ragas.llms.prompt import PromptValue |
| 17 | + |
| 18 | + |
| 19 | +class ScoreFeedback(BaseModel): |
| 20 | + feedback: str = Field(..., description="The feedback for the response") |
| 21 | + score: int = Field(..., description="The score given to the response") |
| 22 | + |
| 23 | + |
| 24 | +class ScoreFeedbackAnswers(BaseModel): |
| 25 | + __root__: t.List[ScoreFeedback] |
| 26 | + |
| 27 | + def dicts(self) -> t.List[t.Dict]: |
| 28 | + return self.dict()["__root__"] |
| 29 | + |
| 30 | + |
| 31 | +_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers) |
| 32 | +_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers) |
| 33 | + |
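For context, `ScoreFeedbackAnswers` is a pydantic v1 root-list model, so the parser above expects the LLM to return a JSON array of feedback/score objects. A minimal sketch of that shape (illustrative only; `expected_payload` is a made-up name, not part of this commit):

```python
# Illustrative only: the JSON payload the parser above is designed to accept.
# Each element mirrors the ScoreFeedback model (feedback: str, score: int).
expected_payload = '[{"feedback": "Accurate and directly answers the question.", "score": 5}]'

parsed = ScoreFeedbackAnswers.parse_raw(expected_payload)  # pydantic v1 JSON parsing
assert parsed.dicts()[0]["score"] == 5
```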
| 34 | +DEFAULT_REFERENCE_FREE_RUBRICS = { |
| 35 | + "score1_description": "The response is incorrect or does not answer the question.", |
| 36 | + "score2_description": "The response is partially correct but may include errors or incomplete information.", |
| 37 | + "score3_description": "The response is generally correct but lacks clarity or completeness.", |
| 38 | + "score4_description": "The response is correct and clear, with minor issues or missing details.", |
| 39 | + "score5_description": "The response is completely accurate, clear, and answers the question directly.", |
| 40 | +} |
| 41 | + |
| 42 | + |
| 43 | +DEFAULT_WITH_REFERENCE_RUBRICS = { |
| 44 | + "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.", |
| 45 | + "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.", |
| 46 | + "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.", |
| 47 | + "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.", |
| 48 | + "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.", |
| 49 | +} |
| 50 | + |
| 51 | + |
| 52 | +WITH_REFERENCE_SCORING_PROMPT = Prompt( |
| 53 | + name="prometheus_score", |
| 54 | + output_format_instruction=_score_feedback_output_instructions, |
| 55 | +    instruction="""Given a question (which might contain an input along with it), an answer to evaluate, a ground_truth answer that gets a score of 5, and a score rubric representing the evaluation criteria:
| 56 | +1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, rather than evaluating it in general terms.
| 57 | +2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
| 58 | + examples=[ |
| 59 | + { |
| 60 | + "question": "What is the capital of France?", |
| 61 | + "answer": "The capital of France is Paris.", |
| 62 | + "ground_truth": "The capital of France is Paris.", |
| 63 | + "rubrics": DEFAULT_WITH_REFERENCE_RUBRICS, |
| 64 | + "analysis": ScoreFeedbackAnswers.parse_obj( |
| 65 | + [ |
| 66 | + { |
| 67 | + "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""", |
| 68 | + "score": 5, |
| 69 | + } |
| 70 | + ] |
| 71 | + ).dicts(), |
| 72 | + } |
| 73 | + ], |
| 74 | + input_keys=["question", "answer", "ground_truth", "rubrics"], |
| 75 | + output_key="analysis", |
| 76 | + language="english", |
| 77 | +) |
| 78 | + |
| 79 | + |
| 80 | +WITHOUT_REFERENCE_SCORING_PROMPT = Prompt( |
| 81 | +    name="prometheus_score_without_reference",
| 82 | + output_format_instruction=_score_feedback_output_instructions, |
| 83 | +    instruction="""Given a question (which might contain an input along with it), an answer to evaluate, and a score rubric representing the evaluation criteria:
| 84 | +1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, rather than evaluating it in general terms.
| 85 | +2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
| 86 | + examples=[ |
| 87 | + { |
| 88 | + "question": "What is the capital of France?", |
| 89 | + "answer": "The capital of France is Paris.", |
| 90 | + "rubrics": DEFAULT_REFERENCE_FREE_RUBRICS, |
| 91 | + "analysis": ScoreFeedbackAnswers.parse_obj( |
| 92 | + [ |
| 93 | + { |
| 94 | +                    "feedback": """The response is completely accurate and directly answers the question about the capital of France. It is clear, concise, and contains no errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""",
| 95 | + "score": 5, |
| 96 | + } |
| 97 | + ] |
| 98 | + ).dicts(), |
| 99 | + } |
| 100 | + ], |
| 101 | + input_keys=["question", "answer", "rubrics"], |
| 102 | + output_key="analysis", |
| 103 | + language="english", |
| 104 | +) |
| 105 | + |
| 106 | + |
| 107 | +@dataclass |
| 108 | +class LabelledRubricsScore(MetricWithLLM): |
| 109 | + name: str = "labelled_rubrics_score" # type: ignore |
| 110 | + evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore |
| 111 | + rubrics: t.Dict[str, str] = field( |
| 112 | + default_factory=lambda: DEFAULT_WITH_REFERENCE_RUBRICS |
| 113 | + ) |
| 114 | + scoring_prompt: Prompt = field( |
| 115 | + default_factory=lambda: WITH_REFERENCE_SCORING_PROMPT |
| 116 | + ) |
| 117 | + max_retries: int = 1 |
| 118 | + |
| 119 | + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: |
| 120 | + assert self.llm is not None, "LLM is not set" |
| 121 | + |
| 122 | + prompt_value = self._create_prompt(row) |
| 123 | + |
| 124 | + response = await self.llm.generate(prompt_value, callbacks=callbacks) |
| 125 | + |
| 126 | + parsed_response = await _score_feedback_output_parser.aparse( |
| 127 | + response.generations[0][0].text, prompt_value, self.llm, self.max_retries |
| 128 | + ) |
| 129 | + |
| 130 | + if parsed_response is None: |
| 131 | + return np.nan |
| 132 | + |
| 133 | +        score = parsed_response.dicts()[0]["score"]
| 134 | +        return float(score)
| 135 | + |
| 136 | + def _create_prompt(self, row: t.Dict) -> PromptValue: |
| 137 | + question, contexts, answer, ground_truth = ( |
| 138 | + row["question"], |
| 139 | + row["contexts"], |
| 140 | + row["answer"], |
| 141 | + row["ground_truth"], |
| 142 | + ) |
| 143 | + contexts = "\n".join(contexts) |
| 144 | + question = f"{question} answer using context: {contexts}" |
| 145 | + return self.scoring_prompt.format( |
| 146 | + question=question, |
| 147 | + answer=answer, |
| 148 | + ground_truth=ground_truth, |
| 149 | + rubrics=self.rubrics, |
| 150 | + ) |
| 151 | + |
| 152 | + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: |
| 153 | + assert self.llm is not None, "LLM must be set to adapt the metric" |
| 154 | + self.scoring_prompt.adapt(language, self.llm, cache_dir) |
| 155 | + |
| 156 | + def save(self, cache_dir: t.Optional[str] = None) -> None: |
| 157 | + self.scoring_prompt.save(cache_dir) |
| 158 | + |
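Because the rubric descriptions and scoring prompt are plain dataclass fields, a customized grading scale can be supplied at construction time. A minimal sketch under that assumption; the legal-QA rubric below is invented purely for illustration:

```python
# Hypothetical custom rubric: stricter grading for a legal-QA use case.
legal_rubrics = {
    "score1_description": "The response misstates the law or does not answer the question.",
    "score2_description": "The response is partially correct but omits key conditions or exceptions.",
    "score3_description": "The response is broadly correct but lacks supporting detail from the ground truth.",
    "score4_description": "The response is correct and well supported, with only minor gaps.",
    "score5_description": "The response is fully correct, complete, and consistent with the ground truth.",
}

# Overrides only the rubrics field; the default with-reference prompt is kept.
strict_legal_scorer = LabelledRubricsScore(rubrics=legal_rubrics)
```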
| 159 | + |
| 160 | +@dataclass |
| 161 | +class ReferenceFreeRubricsScore(LabelledRubricsScore): |
| 162 | + name: str = "reference_free_rubrics_score" # type: ignore |
| 163 | + evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore |
| 164 | + rubrics: t.Dict[str, str] = field( |
| 165 | + default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS |
| 166 | + ) |
| 167 | + scoring_prompt: Prompt = field( |
| 168 | + default_factory=lambda: WITHOUT_REFERENCE_SCORING_PROMPT |
| 169 | + ) |
| 170 | + max_retries: int = 1 |
| 171 | + |
| 172 | + def _create_prompt(self, row: t.Dict) -> PromptValue: |
| 173 | + question, contexts, answer = ( |
| 174 | + row["question"], |
| 175 | + row["contexts"], |
| 176 | + row["answer"], |
| 177 | + ) |
| 178 | + contexts = "\n".join(contexts) |
| 179 | + question = f"{question} answer using context: {contexts}" |
| 180 | + return self.scoring_prompt.format( |
| 181 | + question=question, |
| 182 | + answer=answer, |
| 183 | + rubrics=self.rubrics, |
| 184 | + ) |
| 185 | + |
| 186 | + |
| 187 | +labelled_rubrics_score = LabelledRubricsScore() |
| 188 | +reference_free_rubrics_score = ReferenceFreeRubricsScore() |
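A usage sketch for the module-level instances, assuming the standard `ragas.evaluate` entry point and a Hugging Face `datasets.Dataset` containing the columns these metrics read (`question`, `contexts`, `answer`, `ground_truth`). As with other `MetricWithLLM` subclasses, an evaluation LLM must be configured as usual; the row below is illustrative data, not output of this commit:

```python
# Sketch, not part of this commit: scoring a one-row dataset with both metrics.
from datasets import Dataset

from ragas import evaluate

data = Dataset.from_dict(
    {
        "question": ["What is the capital of France?"],
        "contexts": [["Paris is the capital and largest city of France."]],
        "answer": ["The capital of France is Paris."],
        "ground_truth": ["The capital of France is Paris."],
    }
)

result = evaluate(data, metrics=[labelled_rubrics_score, reference_free_rubrics_score])
print(result)  # per-metric scores keyed by metric name
```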