Commit ee40879

Migrate factual correctness
1 parent b2b28d7 commit ee40879

4 files changed, +612 -2 lines changed


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
 from ragas.metrics.collections._context_relevance import ContextRelevance
+from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
@@ -31,6 +32,7 @@
     "ContextRelevance",
     "DistanceMeasure",
     "ExactMatch",
+    "FactualCorrectness",
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
src/ragas/metrics/collections/_factual_correctness.py

Lines changed: 359 additions & 0 deletions
@@ -0,0 +1,359 @@
"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.metrics.utils import fbeta_score
from ragas.prompt.metrics.common import nli_statement_prompt

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ClaimDecompositionOutput(BaseModel):
    """Structured output for claim decomposition."""

    claims: List[str]


class StatementFaithfulnessAnswer(BaseModel):
    """Individual statement with reason and verdict for NLI evaluation."""

    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]

def claim_decomposition_prompt(
    response: str, atomicity: str = "low", coverage: str = "low"
) -> str:
    """
    V1-identical claim decomposition prompt with configurable atomicity/coverage.

    Args:
        response: The response text to break down into claims
        atomicity: Level of atomicity ("low" or "high")
        coverage: Level of coverage ("low" or "high")

    Returns:
        V1-identical prompt string for the LLM
    """
    import json

    safe_response = json.dumps(response)

    # Select examples based on atomicity and coverage configuration
    if atomicity == "low" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": ["Charles Babbage was a mathematician and philosopher."]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German physicist.",
                        "Albert Einstein developed relativity and contributed to quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "low" and coverage == "high":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a French mathematician, philosopher, and food critic."
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity and also contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "high" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                    ]
                },
            },
        ]
    else:  # high atomicity, high coverage
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                        "Charles Babbage was a food critic.",
                        "Charles Babbage was French.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                        "Albert Einstein contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]

    # Build examples string
    examples_str = "\n".join(
        [
            f"""Example {i + 1}
Input: {json.dumps(ex["input"], indent=4)}
Output: {json.dumps(ex["output"], indent=4)}"""
            for i, ex in enumerate(examples)
        ]
    )

    return f"""Decompose and break down each of the input sentences into one or more standalone statements. Each statement should be a standalone claim that can be independently verified.
Follow the level of atomicity and coverage as shown in the examples.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"claims": {{"description": "Decomposed Claims", "items": {{"type": "string"}}, "title": "Claims", "type": "array"}}}}, "required": ["claims"], "title": "ClaimDecompositionOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
{examples_str}
-----------------------------

Now perform the same with the following input
input: {{
    "response": {safe_response}
}}
Output: """

class FactualCorrectness(BaseMetric):
    """
    Modern v2 implementation of factual correctness evaluation.

    Evaluates the factual correctness of responses by comparing claims made in the response
    against a reference text. Uses claim decomposition and natural language inference (NLI)
    to verify claims in both directions.

    The metric supports three evaluation modes:
    - Precision: What fraction of response claims are supported by reference
    - Recall: What fraction of reference claims are covered by response
    - F1: Harmonic mean of precision and recall (with configurable beta)

    The metric also supports configurable claim decomposition:
    - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims)
    - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage)

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import FactualCorrectness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     reference="Albert Einstein was born in Ulm, Germany on March 14, 1879."
        ... )
        >>> print(f"Factual Correctness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
        mode: Evaluation mode ("precision", "recall", or "f1")
        beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
        atomicity: Claim decomposition atomicity ("low" or "high")
        coverage: Claim decomposition coverage ("low" or "high")
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        mode: t.Literal["precision", "recall", "f1"] = "f1",
        beta: float = 1.0,
        atomicity: t.Literal["low", "high"] = "low",
        coverage: t.Literal["low", "high"] = "low",
        name: str = "factual_correctness",
        **kwargs,
    ):
        """
        Initialize FactualCorrectness metric with required components.

        Args:
            llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
            mode: Evaluation mode ("precision", "recall", or "f1")
            beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
            atomicity: Claim decomposition atomicity ("low" or "high")
            coverage: Claim decomposition coverage ("low" or "high")
            name: The metric name
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode
        self.beta = beta
        self.atomicity = atomicity
        self.coverage = coverage

        # Validate beta parameter
        if not isinstance(beta, (int, float)):
            raise ValueError(
                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
            )

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(self, response: str, reference: str) -> MetricResult:
        """
        Calculate factual correctness score.

        Args:
            response: The response to evaluate for factual correctness
            reference: The reference text to check claims against

        Returns:
            MetricResult with factual correctness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )

        # Step 1: Get claim verifications based on mode
        if self.mode != "recall":
            # Precision and F1 check the response's claims against the reference
            response_verified = await self._decompose_and_verify_claims(
                response, reference
            )
        else:
            response_verified = np.array([], dtype=bool)

        if self.mode != "precision":
            # Recall and F1 check the reference's claims against the response
            reference_verified = await self._decompose_and_verify_claims(
                reference, response
            )
        else:
            reference_verified = np.array([], dtype=bool)

        # Step 2: Compute TP, FP, FN
        if self.mode != "recall":
            # Response claims that are / are not supported by the reference
            tp = int(np.sum(response_verified))
            fp = int(np.sum(~response_verified))
        else:
            # Recall only needs the reference claims covered by the response
            tp = int(np.sum(reference_verified))
            fp = 0

        if self.mode != "precision":
            # Reference claims the response fails to cover
            fn = int(np.sum(~reference_verified))
        else:
            fn = 0

        # Step 3: Compute final score based on mode
        if self.mode == "precision":
            score = tp / (tp + fp + 1e-8)
        elif self.mode == "recall":
            score = tp / (tp + fn + 1e-8)
        else:  # f1
            score = fbeta_score(tp, fp, fn, self.beta)

        return MetricResult(value=float(np.round(score, 2)))

    async def _decompose_claims(self, response: str) -> List[str]:
        """Break response into claims using configurable decomposition."""
        prompt = claim_decomposition_prompt(
            response, atomicity=self.atomicity, coverage=self.coverage
        )
        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
        return result.claims

    async def _verify_claims(
        self, claims: List[str], reference: str
    ) -> NLIStatementOutput:
        """Verify claims against reference using NLI."""
        prompt = nli_statement_prompt(reference, claims)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)
        return result

    async def _decompose_and_verify_claims(
        self, text_to_decompose: str, reference_text: str
    ) -> np.ndarray:
        """Decompose text into claims and verify against reference."""
        claims = await self._decompose_claims(text_to_decompose)
        if not claims:
            return np.array([], dtype=bool)

        verdicts = await self._verify_claims(claims, reference_text)
        if not verdicts.statements:
            return np.array([], dtype=bool)

        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])
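
For reviewers, a rough sketch of how the three modes and the beta knob relate to the TP/FP/FN counts computed in ascore. The counts and the inline F-beta formula below are illustrative only; the metric itself delegates to fbeta_score from ragas.metrics.utils:

# Illustrative counts: 2 response claims supported by the reference, 1 unsupported,
# and 1 reference claim the response fails to cover.
tp, fp, fn = 2, 1, 1

precision = tp / (tp + fp + 1e-8)  # ~0.67: share of response claims that hold up
recall = tp / (tp + fn + 1e-8)     # ~0.67: share of claims covered

beta = 1.0  # beta > 1 weights recall more heavily, beta < 1 weights precision
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-8)

print(round(precision, 2), round(recall, 2), round(f_beta, 2))  # 0.67 0.67 0.67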
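
Similarly, a quick way to see how the atomicity/coverage switches change the few-shot block embedded in the decomposition prompt (a hypothetical REPL check, assuming the module path shown above):

from ragas.metrics.collections._factual_correctness import claim_decomposition_prompt

for atomicity, coverage in [("low", "low"), ("high", "high")]:
    prompt = claim_decomposition_prompt(
        "Einstein was born in Germany in 1879.", atomicity=atomicity, coverage=coverage
    )
    # Different settings embed different Babbage/Einstein example claims
    print(atomicity, coverage, len(prompt))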
