Commit 5c1f9a2
metrics: add domain-specific rubrics-based scoring (#1189)
This can be regarded as the next step in refining the aspect critic metric, i.e.:

- LEVEL 1: the user evaluates using just simple criteria
- **LEVEL 2: the user specifies criteria associated with each score for the entire dataset** (sketched below)
- LEVEL 3: the user specifies individual criteria associated with each score for each sample in the dataset

```python
from ragas import evaluate
from datasets import Dataset, DatasetDict
from ragas.metrics import reference_free_rubrics_score, labelled_rubrics_score

rows = {
    "question": [
        "What's the longest river in the world?",
        "What does the Democratic Republic of Congo flag represent?",
    ],
    "ground_truth": [
        "The Nile is a major north-flowing river in northeastern Africa. It flows into the Mediterranean Sea. The Nile is the longest river in Africa and has historically been considered the longest river in the world, though this has been contested by research suggesting that the Amazon River is slightly longer. Of the world's major rivers, the Nile is one of the smallest, as measured by annual flow in cubic metres of water. About 6,650 km (4,130 mi) long, its drainage basin covers eleven countries: the Democratic Republic of the Congo, Tanzania, Burundi, Rwanda, Uganda, Kenya, Ethiopia, Eritrea, South Sudan, Sudan, and Egypt.",
        "The national flag of the Democratic Republic of the Congo represents blue for peace, red for 'the blood of the country's martyrs', yellow for the country's wealth, and a star for a radiant future for the country.",
    ],
    "answer": [
        "The longest river in the world is the Nile, stretching approximately 6,650 kilometers (4,130 miles) through northeastern Africa, flowing through countries such as Uganda, Sudan, and Egypt before emptying into the Mediterranean Sea. There is some debate about this title, as recent studies suggest the Amazon River could be longer if its longest tributaries are included, potentially extending its length to about 7,000 kilometers (4,350 miles).",
        "The flag of the Democratic Republic of the Congo (DRC) features a sky blue field with a red diagonal stripe bordered by narrow yellow edges, and a yellow five-pointed star in the upper left corner. Each element on the flag carries specific symbolism: the blue represents peace, the red symbolizes the blood of the country's martyrs, the yellow denotes the nation's wealth, and the star stands for hope for a better future.",
    ],
    "contexts": [
        [
            "Scientists debate whether the Amazon or the Nile is the longest river in the world. Traditionally, the Nile is considered longer, but recent information suggests that the Amazon may be longer.",
            "The Nile River was central to the Ancient Egyptians' rise to wealth and power. Since rainfall is almost non-existent in Egypt, the Nile River and its yearly floodwaters offered the people a fertile oasis for rich agriculture.",
            "The world's longest rivers are defined as the longest natural streams whose water flows within a channel, or streambed, with defined banks.",
            "The Amazon River could be considered longer if its longest tributaries are included, potentially extending its length to about 7,000 kilometers.",
        ],
        [
            "The flag of the second Republic of Mobutu Sese Seko became the official banner after Mobutu established his dictatorship. This flag was used from 1966 to 1971 and consisted of the same yellow star, now made smaller, situated in the top corner of the hoist side, with a red, yellow-lined band running diagonally across the center. The red symbolized the people's blood; the yellow symbolized prosperity; the blue symbolized hope; and the star represented unity.",
            "The current flag of the Democratic Republic of Congo, which has been adopted after the approval of a new constitution in 2006, is composed of a blue sheet, red diagonal stripe and a yellow five-pointed star at the top of the left part of the flag. Blue symbolizes peace, red stands for blood of martyrs, yellow color that frames the red stripe denotes prosperity and the star represents hope for a brighter future of the country.",
            "The blue color in the flag symbolizes peace, the red should remind of the country's martyrs, the yellow is for the country's riches and the star represents the future.",
        ],
    ],
}

dataset = Dataset.from_dict(rows)

result = evaluate(
    dataset,
    metrics=[
        reference_free_rubrics_score,
        labelled_rubrics_score,
    ],
)
```

---------

Co-authored-by: Shahules786 <[email protected]>
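A minimal sketch of the LEVEL 2 case, assuming a build that includes this commit: because `LabelledRubricsScore` is a dataclass with a `rubrics` field, a single dataset-wide rubric can be supplied at construction time. The rubric wording below is made up for illustration.

```python
from ragas import evaluate
from ragas.metrics import LabelledRubricsScore

# Hypothetical domain-specific rubric; the keys follow the same
# score{n}_description convention as the defaults shipped in this commit.
domain_rubrics = {
    "score1_description": "The answer contradicts the ground truth.",
    "score2_description": "The answer is mostly wrong, with only fragments matching the ground truth.",
    "score3_description": "The answer is broadly correct but omits key details from the ground truth.",
    "score4_description": "The answer is correct, with only minor omissions.",
    "score5_description": "The answer is fully correct and as detailed as the ground truth.",
}

# `dataset` is the Dataset built in the example above.
result = evaluate(
    dataset,
    metrics=[LabelledRubricsScore(rubrics=domain_rubrics)],
)
```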
1 parent 2868a05 commit 5c1f9a2

File tree

2 files changed: +198 additions, -0 deletions

src/ragas/metrics/__init__.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -13,6 +13,12 @@
 )
 from ragas.metrics._context_recall import ContextRecall, context_recall
 from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
+from ragas.metrics._rubrics_based import (
+    LabelledRubricsScore,
+    ReferenceFreeRubricsScore,
+    labelled_rubrics_score,
+    reference_free_rubrics_score,
+)
 from ragas.metrics._summarization import SummarizationScore, summarization_score
 from ragas.metrics.critique import AspectCritique

@@ -37,4 +43,8 @@
     "context_entity_recall",
     "SummarizationScore",
     "summarization_score",
+    "labelled_rubrics_score",
+    "reference_free_rubrics_score",
+    "ReferenceFreeRubricsScore",
+    "LabelledRubricsScore",
 ]
```
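As a quick sanity check of the new exports (assuming an installed ragas build containing this commit), the names can be imported either as classes or as the pre-built default instances defined at the bottom of `_rubrics_based.py` below:

```python
from ragas.metrics import LabelledRubricsScore, labelled_rubrics_score

# labelled_rubrics_score is a module-level instance configured with the
# default with-reference rubrics and scoring prompt.
assert isinstance(labelled_rubrics_score, LabelledRubricsScore)
print(labelled_rubrics_score.name)  # "labelled_rubrics_score"
```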
src/ragas/metrics/_rubrics_based.py (new file)

Lines changed: 188 additions & 0 deletions
```python
from __future__ import annotations

import typing as t
from dataclasses import dataclass, field

import numpy as np
from langchain_core.pydantic_v1 import BaseModel, Field

from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions
from ragas.llms.prompt import Prompt
from ragas.metrics.base import EvaluationMode, MetricWithLLM

if t.TYPE_CHECKING:
    from langchain_core.callbacks import Callbacks

    from ragas.llms.prompt import PromptValue


class ScoreFeedback(BaseModel):
    feedback: str = Field(..., description="The feedback for the response")
    score: int = Field(..., description="The score given to the response")


class ScoreFeedbackAnswers(BaseModel):
    __root__: t.List[ScoreFeedback]

    def dicts(self) -> t.List[t.Dict]:
        return self.dict()["__root__"]


_score_feedback_output_instructions = get_json_format_instructions(ScoreFeedbackAnswers)
_score_feedback_output_parser = RagasoutputParser(pydantic_object=ScoreFeedbackAnswers)

DEFAULT_REFERENCE_FREE_RUBRICS = {
    "score1_description": "The response is incorrect or does not answer the question.",
    "score2_description": "The response is partially correct but may include errors or incomplete information.",
    "score3_description": "The response is generally correct but lacks clarity or completeness.",
    "score4_description": "The response is correct and clear, with minor issues or missing details.",
    "score5_description": "The response is completely accurate, clear, and answers the question directly.",
}


DEFAULT_WITH_REFERENCE_RUBRICS = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}


WITH_REFERENCE_SCORING_PROMPT = Prompt(
    name="prometheus_score",
    output_format_instruction=_score_feedback_output_instructions,
    instruction="""Given an question (which might contain an input along with it), a answer to evaluate, a ground_truth answer that gets a score of 5, and a score rubric representing evaluation criteria are given.
1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general.
2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
    examples=[
        {
            "question": "What is the capital of France?",
            "answer": "The capital of France is Paris.",
            "ground_truth": "The capital of France is Paris.",
            "rubrics": DEFAULT_WITH_REFERENCE_RUBRICS,
            "analysis": ScoreFeedbackAnswers.parse_obj(
                [
                    {
                        "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""",
                        "score": 5,
                    }
                ]
            ).dicts(),
        }
    ],
    input_keys=["question", "answer", "ground_truth", "rubrics"],
    output_key="analysis",
    language="english",
)


WITHOUT_REFERENCE_SCORING_PROMPT = Prompt(
    name="prometheus_score",
    output_format_instruction=_score_feedback_output_instructions,
    instruction="""Given an question (which might contain an input along with it), a answer to evaluate, and a score rubric representing evaluation criteria are given.
1. Write detailed feedback that assesses the quality of the answer strictly based on the given score rubric, without evaluating in general.
2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""",
    examples=[
        {
            "question": "What is the capital of France?",
            "answer": "The capital of France is Paris.",
            "rubrics": DEFAULT_REFERENCE_FREE_RUBRICS,
            "analysis": ScoreFeedbackAnswers.parse_obj(
                [
                    {
                        "feedback": """The response is completely accurate and directly answers the question about the capital of France. It matches the reference answer perfectly and does not contain any errors or omissions. Given the rubric, this response deserves the highest score as it meets all the criteria for accuracy and clarity.""",
                        "score": 5,
                    }
                ]
            ).dicts(),
        }
    ],
    input_keys=["question", "answer", "rubrics"],
    output_key="analysis",
    language="english",
)


@dataclass
class LabelledRubricsScore(MetricWithLLM):
    name: str = "labelled_rubrics_score"  # type: ignore
    evaluation_mode: EvaluationMode = EvaluationMode.qcg  # type: ignore
    rubrics: t.Dict[str, str] = field(
        default_factory=lambda: DEFAULT_WITH_REFERENCE_RUBRICS
    )
    scoring_prompt: Prompt = field(
        default_factory=lambda: WITH_REFERENCE_SCORING_PROMPT
    )
    max_retries: int = 1

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        assert self.llm is not None, "LLM is not set"

        prompt_value = self._create_prompt(row)

        response = await self.llm.generate(prompt_value, callbacks=callbacks)

        parsed_response = await _score_feedback_output_parser.aparse(
            response.generations[0][0].text, prompt_value, self.llm, self.max_retries
        )

        if parsed_response is None:
            return np.nan

        score = parsed_response.dicts()[0]["score"]
        return score

    def _create_prompt(self, row: t.Dict) -> PromptValue:
        question, contexts, answer, ground_truth = (
            row["question"],
            row["contexts"],
            row["answer"],
            row["ground_truth"],
        )
        contexts = "\n".join(contexts)
        question = f"{question} answer using context: {contexts}"
        return self.scoring_prompt.format(
            question=question,
            answer=answer,
            ground_truth=ground_truth,
            rubrics=self.rubrics,
        )

    def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
        assert self.llm is not None, "LLM must be set to adapt the metric"
        self.scoring_prompt.adapt(language, self.llm, cache_dir)

    def save(self, cache_dir: t.Optional[str] = None) -> None:
        self.scoring_prompt.save(cache_dir)


@dataclass
class ReferenceFreeRubricsScore(LabelledRubricsScore):
    name: str = "reference_free_rubrics_score"  # type: ignore
    evaluation_mode: EvaluationMode = EvaluationMode.qga  # type: ignore
    rubrics: t.Dict[str, str] = field(
        default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS
    )
    scoring_prompt: Prompt = field(
        default_factory=lambda: WITHOUT_REFERENCE_SCORING_PROMPT
    )
    max_retries: int = 1

    def _create_prompt(self, row: t.Dict) -> PromptValue:
        question, contexts, answer = (
            row["question"],
            row["contexts"],
            row["answer"],
        )
        contexts = "\n".join(contexts)
        question = f"{question} answer using context: {contexts}"
        return self.scoring_prompt.format(
            question=question,
            answer=answer,
            rubrics=self.rubrics,
        )


labelled_rubrics_score = LabelledRubricsScore()
reference_free_rubrics_score = ReferenceFreeRubricsScore()
```
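For reference, a small illustration (with hypothetical feedback text) of the JSON shape the output parser above accepts and how `_ascore` extracts the final score from it:

```python
# Assumes a ragas build that includes this commit.
from ragas.metrics._rubrics_based import ScoreFeedbackAnswers

# Hypothetical LLM output already parsed into the expected list-of-objects shape.
parsed = ScoreFeedbackAnswers.parse_obj(
    [{"feedback": "Matches the reference answer exactly.", "score": 5}]
)
score = parsed.dicts()[0]["score"]  # _ascore returns this value (or np.nan on parse failure)
print(score)  # 5
```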
