
Commit 074e109

Migrate factual correctness (#2401)
1 parent 5b64f89 commit 074e109

File tree

5 files changed: +607 -2 lines changed

src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@
     ContextUtilization,
 )
 from ragas.metrics.collections._context_relevance import ContextRelevance
+from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
@@ -41,6 +42,7 @@
     "ContextUtilization",
     "DistanceMeasure",
     "ExactMatch",
+    "FactualCorrectness",
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
Lines changed: 207 additions & 0 deletions

@@ -0,0 +1,207 @@
"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.metrics.utils import fbeta_score
from ragas.prompt.metrics.common import nli_statement_prompt
from ragas.prompt.metrics.factual_correctness import claim_decomposition_prompt

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ClaimDecompositionOutput(BaseModel):
    """Structured output for claim decomposition."""

    claims: List[str]


class StatementFaithfulnessAnswer(BaseModel):
    """Individual statement with reason and verdict for NLI evaluation."""

    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]


class FactualCorrectness(BaseMetric):
    """
    Modern v2 implementation of factual correctness evaluation.

    Evaluates the factual correctness of responses by comparing claims made in the response
    against a reference text. Uses claim decomposition and natural language inference (NLI)
    to verify claims in both directions.

    The metric supports three evaluation modes:
    - Precision: What fraction of response claims are supported by reference
    - Recall: What fraction of reference claims are covered by response
    - F1: Harmonic mean of precision and recall (with configurable beta)

    The metric also supports configurable claim decomposition:
    - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims)
    - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage)

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import FactualCorrectness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     reference="Albert Einstein was born in Ulm, Germany on March 14, 1879."
        ... )
        >>> print(f"Factual Correctness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
        mode: Evaluation mode ("precision", "recall", or "f1")
        beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
        atomicity: Claim decomposition atomicity ("low" or "high")
        coverage: Claim decomposition coverage ("low" or "high")
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        mode: t.Literal["precision", "recall", "f1"] = "f1",
        beta: float = 1.0,
        atomicity: t.Literal["low", "high"] = "low",
        coverage: t.Literal["low", "high"] = "low",
        name: str = "factual_correctness",
        **kwargs,
    ):
        """
        Initialize FactualCorrectness metric with required components.

        Args:
            llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
            mode: Evaluation mode ("precision", "recall", or "f1")
            beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
            atomicity: Claim decomposition atomicity ("low" or "high")
            coverage: Claim decomposition coverage ("low" or "high")
            name: The metric name
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode
        self.beta = beta
        self.atomicity = atomicity
        self.coverage = coverage

        # Validate beta parameter
        if not isinstance(beta, (int, float)):
            raise ValueError(
                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
            )

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(self, response: str, reference: str) -> MetricResult:
        """
        Calculate factual correctness score.

        Args:
            response: The response to evaluate for factual correctness
            reference: The reference text to check claims against

        Returns:
            MetricResult with factual correctness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )

        # Step 1: Get claim verifications to match legacy behavior exactly
        # Legacy always does: decompose response → verify against reference
        reference_response = await self._decompose_and_verify_claims(
            response, reference
        )

        if self.mode != "precision":
            # For recall and f1, also do: decompose reference → verify against response
            response_reference = await self._decompose_and_verify_claims(
                reference, response
            )
        else:
            response_reference = np.array([], dtype=bool)

        # Step 2: Compute TP, FP, FN exactly like legacy
        tp = int(np.sum(reference_response))
        fp = int(np.sum(~reference_response))
        if self.mode != "precision":
            fn = int(np.sum(~response_reference))
        else:
            fn = 0

        # Step 3: Compute final score based on mode
        if self.mode == "precision":
            score = tp / (tp + fp + 1e-8)
        elif self.mode == "recall":
            score = tp / (tp + fn + 1e-8)
        else:  # f1
            score = fbeta_score(tp, fp, fn, self.beta)

        return MetricResult(value=float(np.round(score, 2)))

    async def _decompose_claims(self, response: str) -> List[str]:
        """Break response into claims using configurable decomposition."""
        prompt = claim_decomposition_prompt(
            response, atomicity=self.atomicity, coverage=self.coverage
        )
        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
        return result.claims

    async def _verify_claims(
        self, claims: List[str], reference: str
    ) -> NLIStatementOutput:
        """Verify claims against reference using NLI."""
        prompt = nli_statement_prompt(reference, claims)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)
        return result

    async def _decompose_and_verify_claims(
        self, text_to_decompose: str, reference_text: str
    ) -> np.ndarray:
        """Decompose text into claims and verify against reference."""
        claims = await self._decompose_claims(text_to_decompose)
        if not claims:
            return np.array([], dtype=bool)

        verdicts = await self._verify_claims(claims, reference_text)
        if not verdicts.statements:
            return np.array([], dtype=bool)

        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])
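
To make the scoring step concrete, here is a small, self-contained sketch of the arithmetic that `ascore` performs in Steps 2 and 3. The verdict arrays and the local `fbeta` helper are illustrative stand-ins: in the real metric the verdicts come from the NLI prompt and the F-beta value from `ragas.metrics.utils.fbeta_score`; only the TP/FP/FN bookkeeping mirrors the code above.

import numpy as np

# Hypothetical NLI verdicts: True means the claim is supported by the other text.
response_vs_reference = np.array([True, True, False])   # response claims checked against the reference
reference_vs_response = np.array([True, False, False])  # reference claims checked against the response

tp = int(np.sum(response_vs_reference))   # response claims supported by the reference -> 2
fp = int(np.sum(~response_vs_reference))  # response claims not supported -> 1
fn = int(np.sum(~reference_vs_response))  # reference claims the response misses -> 2


def fbeta(tp: int, fp: int, fn: int, beta: float = 1.0) -> float:
    """Plain F-beta from counts; a stand-in for ragas.metrics.utils.fbeta_score."""
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    if precision + recall == 0:
        return 0.0
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)


print(round(tp / (tp + fp + 1e-8), 2))        # mode="precision" -> 0.67
print(round(tp / (tp + fn + 1e-8), 2))        # mode="recall"    -> 0.5
print(round(fbeta(tp, fp, fn, beta=1.0), 2))  # mode="f1"        -> 0.57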
Lines changed: 146 additions & 0 deletions

@@ -0,0 +1,146 @@
"""Factual correctness prompts - V1-identical converted to functions."""

import json


def claim_decomposition_prompt(
    response: str, atomicity: str = "low", coverage: str = "low"
) -> str:
    """
    V1-identical claim decomposition prompt with configurable atomicity/coverage.

    Args:
        response: The response text to break down into claims
        atomicity: Level of atomicity ("low" or "high")
        coverage: Level of coverage ("low" or "high")

    Returns:
        V1-identical prompt string for the LLM
    """
    safe_response = json.dumps(response)

    # Select examples based on atomicity and coverage configuration
    if atomicity == "low" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": ["Charles Babbage was a mathematician and philosopher."]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German physicist.",
                        "Albert Einstein developed relativity and contributed to quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "low" and coverage == "high":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a French mathematician, philosopher, and food critic."
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity and also contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "high" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                    ]
                },
            },
        ]
    else:  # high atomicity, high coverage
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                        "Charles Babbage was a food critic.",
                        "Charles Babbage was French.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                        "Albert Einstein contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]

    # Build examples string
    examples_str = "\n".join(
        [
            f"""Example {i + 1}
Input: {json.dumps(ex["input"], indent=4)}
Output: {json.dumps(ex["output"], indent=4)}"""
            for i, ex in enumerate(examples)
        ]
    )

    return f"""Decompose and break down each of the input sentences into one or more standalone statements. Each statement should be a standalone claim that can be independently verified.
Follow the level of atomicity and coverage as shown in the examples.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"claims": {{"description": "Decomposed Claims", "items": {{"type": "string"}}, "title": "Claims", "type": "array"}}}}, "required": ["claims"], "title": "ClaimDecompositionOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
{examples_str}
-----------------------------

Now perform the same with the following input
input: {{
"response": {safe_response}
}}
Output: """
