"""Noise Sensitivity metric v2 - Modern implementation with function-based prompts."""

import typing as t
from typing import Dict, List, Literal

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_correctness import statement_generator_prompt
from ragas.prompt.metrics.noise_sensitivity import nli_statement_prompt

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class StatementGeneratorOutput(BaseModel):
    """Structured output for statement generation."""

    statements: List[str]


class StatementFaithfulnessAnswer(BaseModel):
    """Individual statement with reason and verdict for NLI evaluation."""

    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]


class NoiseSensitivity(BaseMetric):
    """
    Modern v2 implementation of noise sensitivity evaluation.

    Measures how often a system makes errors by providing incorrect responses
    when utilizing either relevant or irrelevant retrieved documents.

    The metric works by:
    1. Decomposing reference and response into atomic statements
    2. Using NLI to evaluate statement faithfulness against each retrieved context
    3. Computing noise sensitivity based on incorrect claims from relevant/irrelevant contexts

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import NoiseSensitivity
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>>
        >>> # Create metric instance
        >>> metric = NoiseSensitivity(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is LIC known for?",
        ...     response="LIC is the largest insurance company in India...",
        ...     reference="LIC is known for managing investments...",
        ...     retrieved_contexts=["LIC was established in 1956...", ...]
        ... )
        >>> print(f"Noise Sensitivity: {result.value}")
        >>>
        >>> # Test irrelevant context sensitivity
        >>> irrelevant_metric = NoiseSensitivity(llm=llm, mode="irrelevant")

    Attributes:
        llm: Modern instructor-based LLM for statement generation and NLI evaluation
        name: The metric name
        mode: Either "relevant" or "irrelevant" context sensitivity
        allowed_values: Score range (0.0 to 1.0, lower is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "noise_sensitivity",
        mode: Literal["relevant", "irrelevant"] = "relevant",
        **kwargs,
    ):
        """
        Initialize NoiseSensitivity metric with required components.

        Args:
            llm: Modern instructor-based LLM for statement generation and NLI evaluation
            name: The metric name
            mode: Either "relevant" or "irrelevant" context sensitivity mode
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode

        # Validate mode
        if mode not in {"relevant", "irrelevant"}:
            raise ValueError(
                f"Invalid argument passed for 'mode': {mode}. Must be 'relevant' or 'irrelevant'."
            )

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self,
        user_input: str,
        response: str,
        reference: str,
        retrieved_contexts: List[str],
    ) -> MetricResult:
        """
        Calculate noise sensitivity score.

        Args:
            user_input: The original question
            response: The answer to evaluate
            reference: The ground truth reference
            retrieved_contexts: The retrieved contexts used to generate the response

        Returns:
            MetricResult with noise sensitivity score (0.0-1.0, lower is better)
        """
        # Input validation
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )
        if not user_input:
            raise ValueError(
                "user_input is missing. Please add user_input to the test sample."
            )
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Step 1: Decompose reference and response into statements
        gt_statements = await self._decompose_answer_into_statements(
            reference, user_input
        )
        ans_statements = await self._decompose_answer_into_statements(
            response, user_input
        )

        # Step 2: Evaluate statement faithfulness against each retrieved context
        gt_verdictslist = []
        ans_verdictslist = []

        for ctx in retrieved_contexts:
            # Evaluate ground truth statements against this context
            gt_verdicts = await self._evaluate_statement_faithfulness(
                gt_statements, ctx
            )
            gt_verdictslist.append(np.array(gt_verdicts))

            # Evaluate answer statements against this context
            ans_verdicts = await self._evaluate_statement_faithfulness(
                ans_statements, ctx
            )
            ans_verdictslist.append(np.array(ans_verdicts))

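        # Shapes at this point (with S_gt reference statements, S_ans response
        # statements, and C retrieved contexts): gt_verdictslist holds C arrays
        # of shape (S_gt,) and ans_verdictslist holds C arrays of shape (S_ans,);
        # the transposes below yield (S_gt, C) and (S_ans, C) matrices.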
        # Step 3: Build matrices for computation (exact legacy shape handling)
        answers = {}
        answers["retrieved2ground_truth"] = np.array(gt_verdictslist).T
        answers["retrieved2answer"] = np.array(ans_verdictslist).T

        # Evaluate answer statements against reference (ground truth)
        gt_to_ans_verdicts = await self._evaluate_statement_faithfulness(
            ans_statements, reference
        )
        answers["ground_truth2answer"] = np.array(gt_to_ans_verdicts)
        # Wrap in another array to match legacy shape handling
        answers["ground_truth2answer"] = np.array([answers["ground_truth2answer"]])

        # Convert all to boolean arrays
        answers = {k: v.astype(bool) for k, v in answers.items()}

        # Step 4: Compute noise sensitivity score
        score = self._compute_score(answers)

        return MetricResult(value=float(score))

    async def _decompose_answer_into_statements(
        self, text: str, question: str
    ) -> List[str]:
        """Decompose answer text into atomic statements."""
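        # Illustrative only - the actual splits depend on the LLM. For example,
        # "LIC is an Indian insurer founded in 1956." might be decomposed into
        # ["LIC is an Indian insurer.", "LIC was founded in 1956."].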
        prompt = statement_generator_prompt(question, text)
        result = await self.llm.agenerate(prompt, StatementGeneratorOutput)
        return result.statements

    async def _evaluate_statement_faithfulness(
        self, statements: List[str], context: str
    ) -> List[int]:
        """Evaluate faithfulness of statements against context using NLI."""
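        # Each verdict is 1 when the LLM judges that the statement can be
        # inferred from the given context, and 0 otherwise.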
        prompt = nli_statement_prompt(context, statements)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)

        verdict_list = [
            1 if statement.verdict else 0 for statement in result.statements
        ]
        return verdict_list

    def _compute_score(self, answers: Dict) -> float:
        """Compute noise sensitivity score from faithfulness matrices."""
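        # Worked example (illustrative, with 2 reference statements, 2 response
        # statements, and 2 contexts; 1 = supported, 0 = not):
        #   retrieved2ground_truth = [[1, 0],   context 0 backs a reference claim,
        #                             [0, 0]]   context 1 backs none (irrelevant)
        #   retrieved2answer       = [[1, 0],   response claim 0 backed by context 0
        #                             [1, 1]]   response claim 1 backed by both
        #   ground_truth2answer    = [[1, 0]]   response claim 1 not supported by reference
        # In "relevant" mode, claim 1 is incorrect yet backed by a relevant
        # context, so the score is mean([0, 1]) = 0.5.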
        incorrect = ~answers["ground_truth2answer"]

        # Compute relevant retrievals (needed for both modes)
        relevant_retrieved = np.max(
            answers["retrieved2ground_truth"], axis=0, keepdims=True
        )
        relevant_faithful = np.max(
            relevant_retrieved & answers["retrieved2answer"], axis=1
        )

        if self.mode == "irrelevant":
            # Compute irrelevant retrievals
            irrelevant_retrieved = ~relevant_retrieved
            irrelevant_faithful = np.max(
                irrelevant_retrieved & answers["retrieved2answer"], axis=1
            )

            # Keep them exclusive (irrelevant should not include relevant)
            irrelevant_faithful &= ~relevant_faithful

            return float(np.mean(irrelevant_faithful & incorrect))

        else:  # mode == "relevant"
            return float(np.mean(relevant_faithful & incorrect))