Commit cf7e8a7

Migrate noise sensitivity (#2379)
1 parent 35e884b commit cf7e8a7

4 files changed, +528 -0 lines changed

src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 )
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._simple_criteria import SimpleCriteria
@@ -35,6 +36,7 @@
     "ContextEntityRecall",
     "DistanceMeasure",
     "ExactMatch",
+    "NoiseSensitivity",
     "NonLLMStringSimilarity",
     "RougeScore",
     "SemanticSimilarity",
src/ragas/metrics/collections/_noise_sensitivity.py

Lines changed: 244 additions & 0 deletions
@@ -0,0 +1,244 @@
"""Noise Sensitivity metric v2 - Modern implementation with function-based prompts."""

import typing as t
from typing import Dict, List, Literal

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_correctness import statement_generator_prompt
from ragas.prompt.metrics.noise_sensitivity import nli_statement_prompt

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class StatementGeneratorOutput(BaseModel):
    """Structured output for statement generation."""

    statements: List[str]


class StatementFaithfulnessAnswer(BaseModel):
    """Individual statement with reason and verdict for NLI evaluation."""

    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]


class NoiseSensitivity(BaseMetric):
    """
    Modern v2 implementation of noise sensitivity evaluation.

    Measures how often a system makes errors by providing incorrect responses
    when utilizing either relevant or irrelevant retrieved documents.

    The metric works by:
    1. Decomposing reference and response into atomic statements
    2. Using NLI to evaluate statement faithfulness against each retrieved context
    3. Computing noise sensitivity based on incorrect claims from relevant/irrelevant contexts

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import NoiseSensitivity
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>>
        >>> # Create metric instance
        >>> metric = NoiseSensitivity(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is LIC known for?",
        ...     response="LIC is the largest insurance company in India...",
        ...     reference="LIC is known for managing investments...",
        ...     retrieved_contexts=["LIC was established in 1956...", ...]
        ... )
        >>> print(f"Noise Sensitivity: {result.value}")
        >>>
        >>> # Test irrelevant context sensitivity
        >>> irrelevant_metric = NoiseSensitivity(llm=llm, mode="irrelevant")

    Attributes:
        llm: Modern instructor-based LLM for statement generation and NLI evaluation
        name: The metric name
        mode: Either "relevant" or "irrelevant" context sensitivity
        allowed_values: Score range (0.0 to 1.0, lower is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "noise_sensitivity",
        mode: Literal["relevant", "irrelevant"] = "relevant",
        **kwargs,
    ):
        """
        Initialize NoiseSensitivity metric with required components.

        Args:
            llm: Modern instructor-based LLM for statement generation and NLI evaluation
            name: The metric name
            mode: Either "relevant" or "irrelevant" context sensitivity mode
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode

        # Validate mode
        if mode not in {"relevant", "irrelevant"}:
            raise ValueError(
                f"Invalid argument passed for 'mode': {mode}. Must be 'relevant' or 'irrelevant'."
            )

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self,
        user_input: str,
        response: str,
        reference: str,
        retrieved_contexts: List[str],
    ) -> MetricResult:
        """
        Calculate noise sensitivity score.

        Args:
            user_input: The original question
            response: The answer to evaluate
            reference: The ground truth reference
            retrieved_contexts: The retrieved contexts used to generate the response

        Returns:
            MetricResult with noise sensitivity score (0.0-1.0, lower is better)
        """
        # Input validation
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )
        if not user_input:
            raise ValueError(
                "user_input is missing. Please add user_input to the test sample."
            )
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Step 1: Decompose reference and response into statements
        gt_statements = await self._decompose_answer_into_statements(
            reference, user_input
        )
        ans_statements = await self._decompose_answer_into_statements(
            response, user_input
        )

        # Step 2: Evaluate statement faithfulness against each retrieved context
        gt_verdictslist = []
        ans_verdictslist = []

        for ctx in retrieved_contexts:
            # Evaluate ground truth statements against this context
            gt_verdicts = await self._evaluate_statement_faithfulness(
                gt_statements, ctx
            )
            gt_verdictslist.append(np.array(gt_verdicts))

            # Evaluate answer statements against this context
            ans_verdicts = await self._evaluate_statement_faithfulness(
                ans_statements, ctx
            )
            ans_verdictslist.append(np.array(ans_verdicts))

        # Step 3: Build matrices for computation (exact legacy shape handling)
        answers = {}
        answers["retrieved2ground_truth"] = np.array(gt_verdictslist).T
        answers["retrieved2answer"] = np.array(ans_verdictslist).T

        # Evaluate answer statements against reference (ground truth)
        gt_to_ans_verdicts = await self._evaluate_statement_faithfulness(
            ans_statements, reference
        )
        answers["ground_truth2answer"] = np.array(gt_to_ans_verdicts)
        # Wrap in another array to match legacy shape handling
        answers["ground_truth2answer"] = np.array([answers["ground_truth2answer"]])

        # Convert all to boolean arrays
        answers = {k: v.astype(bool) for k, v in answers.items()}

        # Step 4: Compute noise sensitivity score
        score = self._compute_score(answers)

        return MetricResult(value=float(score))

    async def _decompose_answer_into_statements(
        self, text: str, question: str
    ) -> List[str]:
        """Decompose answer text into atomic statements."""
        prompt = statement_generator_prompt(question, text)
        result = await self.llm.agenerate(prompt, StatementGeneratorOutput)
        return result.statements

    async def _evaluate_statement_faithfulness(
        self, statements: List[str], context: str
    ) -> List[int]:
        """Evaluate faithfulness of statements against context using NLI."""
        prompt = nli_statement_prompt(context, statements)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)

        verdict_list = [
            1 if statement.verdict else 0 for statement in result.statements
        ]
        return verdict_list

    def _compute_score(self, answers: Dict) -> float:
        """Compute noise sensitivity score from faithfulness matrices."""
        incorrect = ~answers["ground_truth2answer"]

        # Compute relevant retrievals (needed for both modes)
        relevant_retrieved = np.max(
            answers["retrieved2ground_truth"], axis=0, keepdims=True
        )
        relevant_faithful = np.max(
            relevant_retrieved & answers["retrieved2answer"], axis=1
        )

        if self.mode == "irrelevant":
            # Compute irrelevant retrievals
            irrelevant_retrieved = ~relevant_retrieved
            irrelevant_faithful = np.max(
                irrelevant_retrieved & answers["retrieved2answer"], axis=1
            )

            # Keep them exclusive (irrelevant should not include relevant)
            irrelevant_faithful &= ~relevant_faithful

            return float(np.mean(irrelevant_faithful & incorrect))

        else:  # mode == "relevant"
            return float(np.mean(relevant_faithful & incorrect))
src/ragas/prompt/metrics/noise_sensitivity.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
"""Noise Sensitivity prompts - V1-identical using exact PydanticPrompt.to_string() output."""

import json
import typing as t


def nli_statement_prompt(context: str, statements: t.List[str]) -> str:
    """
    V1-identical NLI statement evaluation - matches PydanticPrompt.to_string() exactly.

    Args:
        context: The context to evaluate statements against
        statements: The statements to judge for faithfulness

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_context = json.dumps(context)
    safe_statements = json.dumps(statements, indent=4).replace("\n", "\n    ")

    return f"""Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"$defs": {{"StatementFaithfulnessAnswer": {{"properties": {{"statement": {{"description": "the original statement, word-by-word", "title": "Statement", "type": "string"}}, "reason": {{"description": "the reason of the verdict", "title": "Reason", "type": "string"}}, "verdict": {{"description": "the verdict(0/1) of the faithfulness.", "title": "Verdict", "type": "integer"}}}}, "required": ["statement", "reason", "verdict"], "title": "StatementFaithfulnessAnswer", "type": "object"}}}}, "properties": {{"statements": {{"items": {{"$ref": "#/$defs/StatementFaithfulnessAnswer"}}, "title": "Statements", "type": "array"}}}}, "required": ["statements"], "title": "NLIStatementOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "context": "John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.",
    "statements": [
        "John is majoring in Biology.",
        "John is taking a course on Artificial Intelligence.",
        "John is a dedicated student.",
        "John has a part-time job."
    ]
}}
Output: {{
    "statements": [
        {{
            "statement": "John is majoring in Biology.",
            "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
            "verdict": 0
        }},
        {{
            "statement": "John is taking a course on Artificial Intelligence.",
            "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
            "verdict": 0
        }},
        {{
            "statement": "John is a dedicated student.",
            "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
            "verdict": 1
        }},
        {{
            "statement": "John has a part-time job.",
            "reason": "There is no information given in the context about John having a part-time job.",
            "verdict": 0
        }}
    ]
}}

Example 2
Input: {{
    "context": "Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.",
    "statements": [
        "Albert Einstein was a genius."
    ]
}}
Output: {{
    "statements": [
        {{
            "statement": "Albert Einstein was a genius.",
            "reason": "The context and statement are unrelated",
            "verdict": 0
        }}
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "context": {safe_context},
    "statements": {safe_statements}
}}
Output: """
