
Commit 09d22fc

Migrate Faithfulness (#2384)
1 parent b2d7693 commit 09d22fc

File tree

9 files changed: +464, -89 lines


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 )
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
@@ -36,6 +37,7 @@
     "ContextEntityRecall",
     "DistanceMeasure",
     "ExactMatch",
+    "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
     "RougeScore",

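With this export in place, the metric is importable directly from the collections package. Below is a minimal construction sketch that mirrors the class docstring in the new _faithfulness.py module further down in this commit; it assumes an OpenAI API key is configured in the environment.

from openai import AsyncOpenAI

from ragas.llms.base import instructor_llm_factory
from ragas.metrics.collections import Faithfulness

# Wrap the async OpenAI client in an instructor-based Ragas LLM, then
# construct the newly exported metric (mirrors the docstring example).
client = AsyncOpenAI()
llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
metric = Faithfulness(llm=llm)
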
src/ragas/metrics/collections/_answer_correctness.py

Lines changed: 2 additions & 4 deletions
@@ -8,10 +8,8 @@
 
 from ragas.metrics.collections.base import BaseMetric
 from ragas.metrics.result import MetricResult
-from ragas.prompt.metrics.answer_correctness import (
-    correctness_classifier_prompt,
-    statement_generator_prompt,
-)
+from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
+from ragas.prompt.metrics.common import statement_generator_prompt
 
 if t.TYPE_CHECKING:
     from ragas.embeddings.base import BaseRagasEmbedding
src/ragas/metrics/collections/_faithfulness.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+"""Faithfulness metric v2 - Modern implementation with function-based prompts."""
+
+import typing as t
+from typing import List
+
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class StatementGeneratorOutput(BaseModel):
+    """Structured output for statement generation."""
+
+    statements: List[str]
+
+
+class StatementFaithfulnessAnswer(BaseModel):
+    """Individual statement with reason and verdict for NLI evaluation."""
+
+    statement: str
+    reason: str
+    verdict: int
+
+
+class NLIStatementOutput(BaseModel):
+    """Structured output for NLI statement evaluation."""
+
+    statements: List[StatementFaithfulnessAnswer]
+
+
+class Faithfulness(BaseMetric):
+    """
+    Modern v2 implementation of faithfulness evaluation.
+
+    Measures how factually consistent a response is with the retrieved context.
+    A response is considered faithful if all its claims can be supported by the context.
+
+    The metric works by:
+    1. Breaking down the response into atomic statements
+    2. Checking each statement against the retrieved contexts using NLI
+    3. Computing faithfulness as the ratio of supported statements
+
+    This implementation uses modern instructor LLMs with structured output.
+    Only supports modern components - legacy wrappers are rejected with clear error messages.
+
+    Usage:
+        >>> import instructor
+        >>> from openai import AsyncOpenAI
+        >>> from ragas.llms.base import instructor_llm_factory
+        >>> from ragas.metrics.collections import Faithfulness
+        >>>
+        >>> # Setup dependencies
+        >>> client = AsyncOpenAI()
+        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
+        >>>
+        >>> # Create metric instance
+        >>> metric = Faithfulness(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="Where was Einstein born?",
+        ...     response="Einstein was born in Germany on 14th March 1879.",
+        ...     retrieved_contexts=["Albert Einstein was born in Germany..."]
+        ... )
+        >>> print(f"Faithfulness Score: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for statement generation and NLI evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "faithfulness",
+        **kwargs,
+    ):
+        """
+        Initialize Faithfulness metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for statement generation and NLI evaluation
+            name: The metric name
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, user_input: str, response: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate faithfulness score.
+
+        Args:
+            user_input: The original question
+            response: The response to evaluate for faithfulness
+            retrieved_contexts: The retrieved contexts to check against
+
+        Returns:
+            MetricResult with faithfulness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not user_input:
+            raise ValueError(
+                "user_input is missing. Please add user_input to the test sample."
+            )
+        if not retrieved_contexts:
+            raise ValueError(
+                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
+            )
+
+        # Step 1: Break response into atomic statements
+        statements = await self._create_statements(user_input, response)
+
+        if not statements:
+            # No statements generated - return NaN like legacy
+            return MetricResult(value=float("nan"))
+
+        # Step 2: Join all contexts and evaluate statements against them
+        context_str = "\n".join(retrieved_contexts)
+        verdicts = await self._create_verdicts(statements, context_str)
+
+        # Step 3: Compute faithfulness score
+        score = self._compute_score(verdicts)
+
+        return MetricResult(value=float(score))
+
+    async def _create_statements(self, question: str, response: str) -> List[str]:
+        """Break response into atomic statements using statement generator."""
+        prompt = statement_generator_prompt(question, response)
+        result = await self.llm.agenerate(prompt, StatementGeneratorOutput)
+        return result.statements
+
+    async def _create_verdicts(
+        self, statements: List[str], context: str
+    ) -> NLIStatementOutput:
+        """Evaluate statement faithfulness against context using NLI."""
+        prompt = nli_statement_prompt(context, statements)
+        result = await self.llm.agenerate(prompt, NLIStatementOutput)
+        return result
+
+    def _compute_score(self, verdicts: NLIStatementOutput) -> float:
+        """Compute faithfulness score as ratio of faithful statements."""
+        if not verdicts.statements:
+            return float("nan")
+
+        faithful_statements = sum(
+            1 if statement.verdict else 0 for statement in verdicts.statements
+        )
+        num_statements = len(verdicts.statements)
+
+        if num_statements > 0:
+            score = faithful_statements / num_statements
+        else:
+            score = float("nan")
+
+        return score
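
The scoring step reduces the NLI verdicts to a plain ratio, so, for example, a response with one of its two statements supported scores 0.5. The sketch below reproduces that arithmetic without any LLM calls; it re-declares the two Pydantic models from the diff above and hand-writes the verdicts purely for illustration.

from typing import List

from pydantic import BaseModel


class StatementFaithfulnessAnswer(BaseModel):
    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    statements: List[StatementFaithfulnessAnswer]


# Hand-written verdicts: one supported claim, one unsupported claim.
verdicts = NLIStatementOutput(
    statements=[
        StatementFaithfulnessAnswer(
            statement="Einstein was born in Germany.",
            reason="Directly supported by the retrieved context.",
            verdict=1,
        ),
        StatementFaithfulnessAnswer(
            statement="Einstein was born on 14th March 1879.",
            reason="The birth date is not mentioned in the retrieved context.",
            verdict=0,
        ),
    ]
)

# Same reduction as _compute_score: supported statements / total statements.
supported = sum(1 if s.verdict else 0 for s in verdicts.statements)
print(supported / len(verdicts.statements))  # 0.5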

src/ragas/metrics/collections/_noise_sensitivity.py

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@
 
 from ragas.metrics.collections.base import BaseMetric
 from ragas.metrics.result import MetricResult
-from ragas.prompt.metrics.answer_correctness import statement_generator_prompt
-from ragas.prompt.metrics.noise_sensitivity import nli_statement_prompt
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
 
 if t.TYPE_CHECKING:
     from ragas.llms.base import InstructorBaseRagasLLM
src/ragas/prompt/metrics/__init__.py

Lines changed: 3 additions & 4 deletions
@@ -1,13 +1,12 @@
 """Metric-specific prompts for Ragas evaluation metrics."""
 
-from ragas.prompt.metrics.answer_correctness import (
-    correctness_classifier_prompt,
-    statement_generator_prompt,
-)
+from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
 from ragas.prompt.metrics.answer_relevance import answer_relevancy_prompt
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
 
 __all__ = [
     "answer_relevancy_prompt",
     "correctness_classifier_prompt",
+    "nli_statement_prompt",
     "statement_generator_prompt",
 ]
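
Both shared prompt builders now live in ragas.prompt.metrics.common and are re-exported here. A short calling sketch follows, assuming the signatures used elsewhere in this commit (question and answer for the statement generator, context plus statement list for the NLI prompt):

from ragas.prompt.metrics import nli_statement_prompt, statement_generator_prompt

# Build the two prompt strings the Faithfulness metric sends to the LLM.
gen_prompt = statement_generator_prompt(
    "Where was Einstein born?",
    "Einstein was born in Germany on 14th March 1879.",
)
nli_prompt = nli_statement_prompt(
    "Albert Einstein was born in Germany...",   # joined retrieved contexts
    ["Einstein was born in Germany."],          # statements to verify
)
print(gen_prompt[:80])
print(nli_prompt[:80])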

src/ragas/prompt/metrics/answer_correctness.py

Lines changed: 15 additions & 51 deletions
@@ -1,70 +1,31 @@
-"""Answer Correctness prompts for statement generation and classification."""
+"""Answer Correctness prompts for classification.
+
+Note: statement_generator_prompt has been moved to ragas.prompt.metrics.common
+"""
 
 import json
 import typing as t
 
 
-def statement_generator_prompt(question: str, answer: str) -> str:
-    """
-    V1-identical statement generator - matches PydanticPrompt.to_string() exactly.
-
-    Args:
-        question: The question being answered
-        answer: The answer text to break down into statements
-
-    Returns:
-        V1-identical prompt string for the LLM
-    """
-    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
-    safe_question = json.dumps(question)
-    safe_answer = json.dumps(answer)
-
-    return f"""Given a question and an answer, analyze the complexity of each sentence in the answer. Break down each sentence into one or more fully understandable statements. Ensure that no pronouns are used in any statement. IMPORTANT: Extract statements EXACTLY as they appear in the answer - do not correct factual errors or change any content. Format the outputs in JSON.
-Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
-{{"properties": {{"statements": {{"description": "The generated statements", "items": {{"type": "string"}}, "title": "Statements", "type": "array"}}}}, "required": ["statements"], "title": "StatementGeneratorOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.
-
---------EXAMPLES-----------
-Example 1
-Input: {{
-    "question": "Who was Albert Einstein and what is he best known for?",
-    "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics."
-}}
-Output: {{
-    "statements": [
-        "Albert Einstein was a German-born theoretical physicist.",
-        "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.",
-        "Albert Einstein was best known for developing the theory of relativity.",
-        "Albert Einstein made important contributions to the development of the theory of quantum mechanics."
-    ]
-}}
------------------------------
-
-Now perform the same with the following input
-input: {{
-    "question": {safe_question},
-    "answer": {safe_answer}
-}}
-Output: """
-
-
 def correctness_classifier_prompt(
     question: str, answer_statements: t.List[str], ground_truth_statements: t.List[str]
 ) -> str:
     """
-    V1-compatible correctness classifier using exact PydanticPrompt structure.
+    V1-identical correctness classifier - matches PydanticPrompt.to_string() exactly.
 
     Args:
         question: The original question
-        answer_statements: List of statements from the answer
-        ground_truth_statements: List of statements from the ground truth
+        answer_statements: List of statements from the answer to evaluate
+        ground_truth_statements: List of ground truth reference statements
 
     Returns:
-        V1-compatible prompt string for the LLM
+        V1-identical prompt string for the LLM
     """
     # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
     safe_question = json.dumps(question)
-    # Format lists with proper indentation to match V1's Pydantic formatting
-    safe_answer = json.dumps(answer_statements, indent=4).replace("\n", "\n    ")
+    safe_answer_statements = json.dumps(answer_statements, indent=4).replace(
+        "\n", "\n    "
+    )
     safe_ground_truth = json.dumps(ground_truth_statements, indent=4).replace(
         "\n", "\n    "
     )
@@ -157,7 +118,10 @@ def correctness_classifier_prompt(
 Now perform the same with the following input
 input: {{
     "question": {safe_question},
-    "answer": {safe_answer},
+    "answer": {safe_answer_statements},
     "ground_truth": {safe_ground_truth}
 }}
 Output: """
+
+
+__all__ = ["correctness_classifier_prompt"]
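
The list inputs are serialized with json.dumps(..., indent=4) and every continuation line is shifted right by four spaces, so the pretty-printed array nests cleanly inside the surrounding input object of the prompt (matching V1's model_dump_json formatting). A small standard-library sketch of that effect, using a made-up statement list:

import json

answer_statements = [
    "Albert Einstein was a German-born theoretical physicist.",
    "Albert Einstein developed the theory of relativity.",
]

# Pretty-print the list, then indent its continuation lines by four spaces
# so it lines up when embedded one level deep inside the prompt's JSON input.
safe_answer_statements = json.dumps(answer_statements, indent=4).replace("\n", "\n    ")

print(f'{{\n    "answer": {safe_answer_statements}\n}}')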

0 commit comments
