
Commit 09d22fc

Migrate Faithfulness (#2384)
1 parent b2d7693 commit 09d22fc

File tree

9 files changed: +464, -89 lines


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 )
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
@@ -36,6 +37,7 @@
     "ContextEntityRecall",
     "DistanceMeasure",
     "ExactMatch",
+    "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
     "RougeScore",

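With this export in place, the metric is importable directly from the collections package. Below is a minimal construction sketch that mirrors the class docstring in the new _faithfulness.py module further down in this commit; it assumes an OpenAI API key is configured in the environment.

from openai import AsyncOpenAI

from ragas.llms.base import instructor_llm_factory
from ragas.metrics.collections import Faithfulness

# Wrap the async OpenAI client in an instructor-based Ragas LLM, then
# construct the newly exported metric (mirrors the docstring example).
client = AsyncOpenAI()
llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
metric = Faithfulness(llm=llm)
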
src/ragas/metrics/collections/_answer_correctness.py

Lines changed: 2 additions & 4 deletions
@@ -8,10 +8,8 @@
 
 from ragas.metrics.collections.base import BaseMetric
 from ragas.metrics.result import MetricResult
-from ragas.prompt.metrics.answer_correctness import (
-    correctness_classifier_prompt,
-    statement_generator_prompt,
-)
+from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
+from ragas.prompt.metrics.common import statement_generator_prompt
 
 if t.TYPE_CHECKING:
     from ragas.embeddings.base import BaseRagasEmbedding
src/ragas/metrics/collections/_faithfulness.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+"""Faithfulness metric v2 - Modern implementation with function-based prompts."""
+
+import typing as t
+from typing import List
+
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class StatementGeneratorOutput(BaseModel):
+    """Structured output for statement generation."""
+
+    statements: List[str]
+
+
+class StatementFaithfulnessAnswer(BaseModel):
+    """Individual statement with reason and verdict for NLI evaluation."""
+
+    statement: str
+    reason: str
+    verdict: int
+
+
+class NLIStatementOutput(BaseModel):
+    """Structured output for NLI statement evaluation."""
+
+    statements: List[StatementFaithfulnessAnswer]
+
+
+class Faithfulness(BaseMetric):
+    """
+    Modern v2 implementation of faithfulness evaluation.
+
+    Measures how factually consistent a response is with the retrieved context.
+    A response is considered faithful if all its claims can be supported by the context.
+
+    The metric works by:
+    1. Breaking down the response into atomic statements
+    2. Checking each statement against the retrieved contexts using NLI
+    3. Computing faithfulness as the ratio of supported statements
+
+    This implementation uses modern instructor LLMs with structured output.
+    Only supports modern components - legacy wrappers are rejected with clear error messages.
+
+    Usage:
+        >>> import instructor
+        >>> from openai import AsyncOpenAI
+        >>> from ragas.llms.base import instructor_llm_factory
+        >>> from ragas.metrics.collections import Faithfulness
+        >>>
+        >>> # Setup dependencies
+        >>> client = AsyncOpenAI()
+        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
+        >>>
+        >>> # Create metric instance
+        >>> metric = Faithfulness(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="Where was Einstein born?",
+        ...     response="Einstein was born in Germany on 14th March 1879.",
+        ...     retrieved_contexts=["Albert Einstein was born in Germany..."]
+        ... )
+        >>> print(f"Faithfulness Score: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for statement generation and NLI evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "faithfulness",
+        **kwargs,
+    ):
+        """
+        Initialize Faithfulness metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for statement generation and NLI evaluation
+            name: The metric name
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, user_input: str, response: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate faithfulness score.
+
+        Args:
+            user_input: The original question
+            response: The response to evaluate for faithfulness
+            retrieved_contexts: The retrieved contexts to check against
+
+        Returns:
+            MetricResult with faithfulness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not user_input:
+            raise ValueError(
+                "user_input is missing. Please add user_input to the test sample."
+            )
+        if not retrieved_contexts:
+            raise ValueError(
+                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
+            )
+
+        # Step 1: Break response into atomic statements
+        statements = await self._create_statements(user_input, response)
+
+        if not statements:
+            # No statements generated - return NaN like legacy
+            return MetricResult(value=float("nan"))
+
+        # Step 2: Join all contexts and evaluate statements against them
+        context_str = "\n".join(retrieved_contexts)
+        verdicts = await self._create_verdicts(statements, context_str)
+
+        # Step 3: Compute faithfulness score
+        score = self._compute_score(verdicts)
+
+        return MetricResult(value=float(score))
+
+    async def _create_statements(self, question: str, response: str) -> List[str]:
+        """Break response into atomic statements using statement generator."""
+        prompt = statement_generator_prompt(question, response)
+        result = await self.llm.agenerate(prompt, StatementGeneratorOutput)
+        return result.statements
+
+    async def _create_verdicts(
+        self, statements: List[str], context: str
+    ) -> NLIStatementOutput:
+        """Evaluate statement faithfulness against context using NLI."""
+        prompt = nli_statement_prompt(context, statements)
+        result = await self.llm.agenerate(prompt, NLIStatementOutput)
+        return result
+
+    def _compute_score(self, verdicts: NLIStatementOutput) -> float:
+        """Compute faithfulness score as ratio of faithful statements."""
+        if not verdicts.statements:
+            return float("nan")
+
+        faithful_statements = sum(
+            1 if statement.verdict else 0 for statement in verdicts.statements
+        )
+        num_statements = len(verdicts.statements)
+
+        if num_statements > 0:
+            score = faithful_statements / num_statements
+        else:
+            score = float("nan")
+
+        return score
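
The scoring step reduces the NLI verdicts to a plain ratio, so, for example, a response with one of its two statements supported scores 0.5. The sketch below reproduces that arithmetic without any LLM calls; it re-declares the two Pydantic models from the diff above and hand-writes the verdicts purely for illustration.

from typing import List

from pydantic import BaseModel


class StatementFaithfulnessAnswer(BaseModel):
    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    statements: List[StatementFaithfulnessAnswer]


# Hand-written verdicts: one supported claim, one unsupported claim.
verdicts = NLIStatementOutput(
    statements=[
        StatementFaithfulnessAnswer(
            statement="Einstein was born in Germany.",
            reason="Directly supported by the retrieved context.",
            verdict=1,
        ),
        StatementFaithfulnessAnswer(
            statement="Einstein was born on 14th March 1879.",
            reason="The birth date is not mentioned in the retrieved context.",
            verdict=0,
        ),
    ]
)

# Same reduction as _compute_score: supported statements / total statements.
supported = sum(1 if s.verdict else 0 for s in verdicts.statements)
print(supported / len(verdicts.statements))  # 0.5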

src/ragas/metrics/collections/_noise_sensitivity.py

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@
 
 from ragas.metrics.collections.base import BaseMetric
 from ragas.metrics.result import MetricResult
-from ragas.prompt.metrics.answer_correctness import statement_generator_prompt
-from ragas.prompt.metrics.noise_sensitivity import nli_statement_prompt
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
 
 if t.TYPE_CHECKING:
     from ragas.llms.base import InstructorBaseRagasLLM
src/ragas/prompt/metrics/__init__.py

Lines changed: 3 additions & 4 deletions
@@ -1,13 +1,12 @@
 """Metric-specific prompts for Ragas evaluation metrics."""
 
-from ragas.prompt.metrics.answer_correctness import (
-    correctness_classifier_prompt,
-    statement_generator_prompt,
-)
+from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
 from ragas.prompt.metrics.answer_relevance import answer_relevancy_prompt
+from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
 
 __all__ = [
     "answer_relevancy_prompt",
     "correctness_classifier_prompt",
+    "nli_statement_prompt",
     "statement_generator_prompt",
 ]
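
Both shared prompt builders now live in ragas.prompt.metrics.common and are re-exported here. A short calling sketch follows, assuming the signatures used elsewhere in this commit (question and answer for the statement generator, context plus statement list for the NLI prompt):

from ragas.prompt.metrics import nli_statement_prompt, statement_generator_prompt

# Build the two prompt strings the Faithfulness metric sends to the LLM.
gen_prompt = statement_generator_prompt(
    "Where was Einstein born?",
    "Einstein was born in Germany on 14th March 1879.",
)
nli_prompt = nli_statement_prompt(
    "Albert Einstein was born in Germany...",   # joined retrieved contexts
    ["Einstein was born in Germany."],          # statements to verify
)
print(gen_prompt[:80])
print(nli_prompt[:80])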

src/ragas/prompt/metrics/answer_correctness.py

Lines changed: 15 additions & 51 deletions
@@ -1,70 +1,31 @@
-"""Answer Correctness prompts for statement generation and classification."""
+"""Answer Correctness prompts for classification.
+
+Note: statement_generator_prompt has been moved to ragas.prompt.metrics.common
+"""
 
 import json
 import typing as t
 
 
-def statement_generator_prompt(question: str, answer: str) -> str:
-    """
-    V1-identical statement generator - matches PydanticPrompt.to_string() exactly.
-
-    Args:
-        question: The question being answered
-        answer: The answer text to break down into statements
-
-    Returns:
-        V1-identical prompt string for the LLM
-    """
-    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
-    safe_question = json.dumps(question)
-    safe_answer = json.dumps(answer)
-
-    return f"""Given a question and an answer, analyze the complexity of each sentence in the answer. Break down each sentence into one or more fully understandable statements. Ensure that no pronouns are used in any statement. IMPORTANT: Extract statements EXACTLY as they appear in the answer - do not correct factual errors or change any content. Format the outputs in JSON.
-Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
-{{"properties": {{"statements": {{"description": "The generated statements", "items": {{"type": "string"}}, "title": "Statements", "type": "array"}}}}, "required": ["statements"], "title": "StatementGeneratorOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.
-
---------EXAMPLES-----------
-Example 1
-Input: {{
-    "question": "Who was Albert Einstein and what is he best known for?",
-    "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics."
-}}
-Output: {{
-    "statements": [
-        "Albert Einstein was a German-born theoretical physicist.",
-        "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.",
-        "Albert Einstein was best known for developing the theory of relativity.",
-        "Albert Einstein made important contributions to the development of the theory of quantum mechanics."
-    ]
-}}
------------------------------
-
-Now perform the same with the following input
-input: {{
-    "question": {safe_question},
-    "answer": {safe_answer}
-}}
-Output: """
-
-
 def correctness_classifier_prompt(
     question: str, answer_statements: t.List[str], ground_truth_statements: t.List[str]
 ) -> str:
     """
-    V1-compatible correctness classifier using exact PydanticPrompt structure.
+    V1-identical correctness classifier - matches PydanticPrompt.to_string() exactly.
 
     Args:
         question: The original question
-        answer_statements: List of statements from the answer
-        ground_truth_statements: List of statements from the ground truth
+        answer_statements: List of statements from the answer to evaluate
+        ground_truth_statements: List of ground truth reference statements
 
     Returns:
-        V1-compatible prompt string for the LLM
+        V1-identical prompt string for the LLM
     """
     # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
     safe_question = json.dumps(question)
-    # Format lists with proper indentation to match V1's Pydantic formatting
-    safe_answer = json.dumps(answer_statements, indent=4).replace("\n", "\n    ")
+    safe_answer_statements = json.dumps(answer_statements, indent=4).replace(
+        "\n", "\n    "
+    )
     safe_ground_truth = json.dumps(ground_truth_statements, indent=4).replace(
         "\n", "\n    "
     )
@@ -157,7 +118,10 @@ def correctness_classifier_prompt(
 Now perform the same with the following input
 input: {{
     "question": {safe_question},
-    "answer": {safe_answer},
+    "answer": {safe_answer_statements},
     "ground_truth": {safe_ground_truth}
 }}
 Output: """
+
+
+__all__ = ["correctness_classifier_prompt"]
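
The list inputs are serialized with json.dumps(..., indent=4) and every continuation line is shifted right by four spaces, so the pretty-printed array nests cleanly inside the surrounding input object of the prompt (matching V1's model_dump_json formatting). A small standard-library sketch of that effect, using a made-up statement list:

import json

answer_statements = [
    "Albert Einstein was a German-born theoretical physicist.",
    "Albert Einstein developed the theory of relativity.",
]

# Pretty-print the list, then indent its continuation lines by four spaces
# so it lines up when embedded one level deep inside the prompt's JSON input.
safe_answer_statements = json.dumps(answer_statements, indent=4).replace("\n", "\n    ")

print(f'{{\n    "answer": {safe_answer_statements}\n}}')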

0 commit comments
