"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring."""

import json
import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.metrics.utils import fbeta_score
from ragas.prompt.metrics.common import nli_statement_prompt

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ClaimDecompositionOutput(BaseModel):
    """Structured output for claim decomposition."""

    claims: List[str]


class StatementFaithfulnessAnswer(BaseModel):
    """Individual statement with reason and verdict for NLI evaluation."""

    statement: str
    reason: str
    verdict: int


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]


def claim_decomposition_prompt(
    response: str, atomicity: str = "low", coverage: str = "low"
) -> str:
    """
    V1-identical claim decomposition prompt with configurable atomicity/coverage.

    Args:
        response: The response text to break down into claims
        atomicity: Level of atomicity ("low" or "high")
        coverage: Level of coverage ("low" or "high")

    Returns:
        V1-identical prompt string for the LLM
    """
    safe_response = json.dumps(response)

    # Select few-shot examples based on the atomicity and coverage configuration
    if atomicity == "low" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": ["Charles Babbage was a mathematician and philosopher."]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German physicist.",
                        "Albert Einstein developed relativity and contributed to quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "low" and coverage == "high":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a French mathematician, philosopher, and food critic."
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity and also contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]
    elif atomicity == "high" and coverage == "low":
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                    ]
                },
            },
        ]
    else:  # high atomicity, high coverage
        examples = [
            {
                "input": {
                    "response": "Charles Babbage was a French mathematician, philosopher, and food critic."
                },
                "output": {
                    "claims": [
                        "Charles Babbage was a mathematician.",
                        "Charles Babbage was a philosopher.",
                        "Charles Babbage was a food critic.",
                        "Charles Babbage was French.",
                    ]
                },
            },
            {
                "input": {
                    "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics."
                },
                "output": {
                    "claims": [
                        "Albert Einstein was a German theoretical physicist.",
                        "Albert Einstein developed the theory of relativity.",
                        "Albert Einstein contributed to the development of quantum mechanics.",
                    ]
                },
            },
        ]

    # Build the few-shot examples string
    examples_str = "\n".join(
        [
            f"""Example {i + 1}
Input: {json.dumps(ex["input"], indent=4)}
Output: {json.dumps(ex["output"], indent=4)}"""
            for i, ex in enumerate(examples)
        ]
    )

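    # NOTE: the instruction text below (including spacing quirks such as
    # "quotes,properly") is kept verbatim for v1 prompt parity.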
    return f"""Decompose and break down each of the input sentences into one or more standalone statements. Each statement should be a standalone claim that can be independently verified.
Follow the level of atomicity and coverage as shown in the examples.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"claims": {{"description": "Decomposed Claims", "items": {{"type": "string"}}, "title": "Claims", "type": "array"}}}}, "required": ["claims"], "title": "ClaimDecompositionOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
{examples_str}
-----------------------------

Now perform the same with the following input
input: {{
    "response": {safe_response}
}}
Output: """


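# Illustrative usage (not part of the metric API): the atomicity/coverage knobs
# only swap the few-shot examples embedded in the prompt; the instructions and
# the expected ClaimDecompositionOutput schema stay the same.
#
#   prompt = claim_decomposition_prompt(
#       "Einstein was a German physicist who developed relativity.",
#       atomicity="high",
#       coverage="high",
#   )

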
class FactualCorrectness(BaseMetric):
    """
    Modern v2 implementation of factual correctness evaluation.

    Evaluates the factual correctness of responses by comparing claims made in the
    response against a reference text. Uses claim decomposition and natural language
    inference (NLI) to verify claims in both directions.

    The metric supports three evaluation modes:
    - Precision: what fraction of response claims are supported by the reference
    - Recall: what fraction of reference claims are covered by the response
    - F1: harmonic mean of precision and recall (with configurable beta)

    The metric also supports configurable claim decomposition:
    - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims)
    - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage)

    Usage:
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import FactualCorrectness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     reference="Albert Einstein was born in Ulm, Germany on March 14, 1879."
        ... )
        >>> print(f"Factual Correctness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
        mode: Evaluation mode ("precision", "recall", or "f1")
        beta: Beta parameter for the F-score (>1 favors recall, <1 favors precision)
        atomicity: Claim decomposition atomicity ("low" or "high")
        coverage: Claim decomposition coverage ("low" or "high")
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        mode: t.Literal["precision", "recall", "f1"] = "f1",
        beta: float = 1.0,
        atomicity: t.Literal["low", "high"] = "low",
        coverage: t.Literal["low", "high"] = "low",
        name: str = "factual_correctness",
        **kwargs,
    ):
        """
        Initialize FactualCorrectness metric with required components.

        Args:
            llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
            mode: Evaluation mode ("precision", "recall", or "f1")
            beta: Beta parameter for the F-score (>1 favors recall, <1 favors precision)
            atomicity: Claim decomposition atomicity ("low" or "high")
            coverage: Claim decomposition coverage ("low" or "high")
            name: The metric name
        """
        # Validate the beta parameter before storing attributes
        if not isinstance(beta, (int, float)):
            raise ValueError(
                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
            )

        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode
        self.beta = beta
        self.atomicity = atomicity
        self.coverage = coverage

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(self, response: str, reference: str) -> MetricResult:
        """
        Calculate the factual correctness score.

        Args:
            response: The response to evaluate for factual correctness
            reference: The reference text to check claims against

        Returns:
            MetricResult with factual correctness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )

        # Step 1: Gather claim verifications for the directions the mode needs
        if self.mode != "recall":
            # For precision and f1: response claims verified against the reference
            response_verified = await self._decompose_and_verify_claims(
                response, reference
            )
        else:
            response_verified = np.array([], dtype=bool)

        if self.mode != "precision":
            # For recall and f1: reference claims verified against the response
            reference_verified = await self._decompose_and_verify_claims(
                reference, response
            )
        else:
            reference_verified = np.array([], dtype=bool)

        # Step 2: Compute TP, FP, FN
        # TP: response claims supported by the reference
        # FP: response claims not supported by the reference
        # FN: reference claims not covered by the response
        if self.mode != "recall":
            tp = int(np.sum(response_verified))
            fp = int(np.sum(~response_verified))
        else:
            tp = int(np.sum(reference_verified))
            fp = 0

        if self.mode != "precision":
            fn = int(np.sum(~reference_verified))
        else:
            fn = 0

        # Step 3: Compute the final score based on mode
        if self.mode == "precision":
            score = tp / (tp + fp + 1e-8)
        elif self.mode == "recall":
            score = tp / (tp + fn + 1e-8)
        else:  # f1
            score = fbeta_score(tp, fp, fn, self.beta)

        return MetricResult(value=float(np.round(score, 2)))

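    # Scoring note (assumes ragas.metrics.utils.fbeta_score implements the
    # standard F-beta): F_beta = (1 + b^2) * tp / ((1 + b^2) * tp + b^2 * fn + fp).
    # The b^2 factor scales the false-negative term, so beta > 1 penalizes
    # uncovered reference claims more (favoring recall), while beta < 1 penalizes
    # unsupported response claims more (favoring precision). With beta = 1 this
    # reduces to F1 = 2 * tp / (2 * tp + fp + fn).
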
    async def _decompose_claims(self, response: str) -> List[str]:
        """Break the response into claims using configurable decomposition."""
        prompt = claim_decomposition_prompt(
            response, atomicity=self.atomicity, coverage=self.coverage
        )
        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
        return result.claims

    async def _verify_claims(
        self, claims: List[str], reference: str
    ) -> NLIStatementOutput:
        """Verify claims against the reference using NLI."""
        prompt = nli_statement_prompt(reference, claims)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)
        return result
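
    # Illustrative shape of the structured NLI result (assumed verdict
    # convention: 1 = claim supported by the given context, 0 = not supported):
    #   NLIStatementOutput(statements=[
    #       StatementFaithfulnessAnswer(
    #           statement="Einstein was born in Germany.",
    #           reason="The reference states he was born in Ulm, Germany.",
    #           verdict=1,
    #       ),
    #   ])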

    async def _decompose_and_verify_claims(
        self, text_to_decompose: str, reference_text: str
    ) -> np.ndarray:
        """Decompose text into claims and verify them against the reference."""
        claims = await self._decompose_claims(text_to_decompose)
        if not claims:
            return np.array([], dtype=bool)

        verdicts = await self._verify_claims(claims, reference_text)
        if not verdicts.statements:
            return np.array([], dtype=bool)

        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])
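

# Worked example of the scoring arithmetic (hypothetical claim counts, not from
# any real LLM call): suppose the response decomposes into 4 claims of which 3
# are supported by the reference (tp = 3, fp = 1), and the reference decomposes
# into 5 claims of which 3 are covered by the response (fn = 2). Then:
#   precision   = 3 / (3 + 1) = 0.75
#   recall      = 3 / (3 + 2) = 0.60
#   f1 (beta=1) = 2 * 3 / (2 * 3 + 1 + 2) = 6 / 9 ≈ 0.67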