| 1 | +"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring.""" |
| 2 | + |
| 3 | +import typing as t |
| 4 | +from typing import List |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +from pydantic import BaseModel |
| 8 | + |
| 9 | +from ragas.metrics.collections.base import BaseMetric |
| 10 | +from ragas.metrics.result import MetricResult |
| 11 | +from ragas.metrics.utils import fbeta_score |
| 12 | +from ragas.prompt.metrics.common import nli_statement_prompt |
| 13 | +from ragas.prompt.metrics.factual_correctness import claim_decomposition_prompt |
| 14 | + |
| 15 | +if t.TYPE_CHECKING: |
| 16 | + from ragas.llms.base import InstructorBaseRagasLLM |
| 17 | + |
| 18 | + |
| 19 | +class ClaimDecompositionOutput(BaseModel): |
| 20 | + """Structured output for claim decomposition.""" |
| 21 | + |
| 22 | + claims: List[str] |
| 23 | + |
| 24 | + |
| 25 | +class StatementFaithfulnessAnswer(BaseModel): |
| 26 | + """Individual statement with reason and verdict for NLI evaluation.""" |
| 27 | + |
| 28 | + statement: str |
| 29 | + reason: str |
    verdict: int  # 1 if the statement is supported by the given context, 0 otherwise


class NLIStatementOutput(BaseModel):
    """Structured output for NLI statement evaluation."""

    statements: List[StatementFaithfulnessAnswer]


class FactualCorrectness(BaseMetric):
    """
    Modern v2 implementation of factual correctness evaluation.

    Evaluates the factual correctness of responses by comparing claims made in the response
    against a reference text. Uses claim decomposition and natural language inference (NLI)
    to verify claims in both directions.

    The metric supports three evaluation modes:
    - Precision: What fraction of response claims are supported by the reference
    - Recall: What fraction of reference claims are covered by the response
    - F1: Harmonic mean of precision and recall (with configurable beta)
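      For precision P and recall R, this assumes the standard weighted F-measure
      F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta > 1 weights recall
      more heavily and beta < 1 weights precision.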

    The metric also supports configurable claim decomposition:
    - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims)
    - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage)

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import FactualCorrectness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     reference="Albert Einstein was born in Ulm, Germany on March 14, 1879."
        ... )
        >>> print(f"Factual Correctness: {result.value}")
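        >>>
        >>> # Precision-only variant (response claims checked against the reference)
        >>> precision_metric = FactualCorrectness(llm=llm, mode="precision")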

    Attributes:
        llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
        mode: Evaluation mode ("precision", "recall", or "f1")
        beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
        atomicity: Claim decomposition atomicity ("low" or "high")
        coverage: Claim decomposition coverage ("low" or "high")
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        mode: t.Literal["precision", "recall", "f1"] = "f1",
        beta: float = 1.0,
        atomicity: t.Literal["low", "high"] = "low",
        coverage: t.Literal["low", "high"] = "low",
        name: str = "factual_correctness",
        **kwargs,
    ):
        """
        Initialize FactualCorrectness metric with required components.

        Args:
            llm: Modern instructor-based LLM for claim decomposition and NLI evaluation
            mode: Evaluation mode ("precision", "recall", or "f1")
            beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision)
            atomicity: Claim decomposition atomicity ("low" or "high")
            coverage: Claim decomposition coverage ("low" or "high")
            name: The metric name
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode
        self.beta = beta
        self.atomicity = atomicity
        self.coverage = coverage

        # Validate beta parameter
        if not isinstance(beta, (int, float)):
            raise ValueError(
                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
            )

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(self, response: str, reference: str) -> MetricResult:
        """
        Calculate factual correctness score.

        Args:
            response: The response to evaluate for factual correctness
            reference: The reference text to check claims against

        Returns:
            MetricResult with factual correctness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )

        # Step 1: Get claim verifications to match legacy behavior exactly
        # Legacy always does: decompose response → verify against reference
        reference_response = await self._decompose_and_verify_claims(
            response, reference
        )

        if self.mode != "precision":
            # For recall and f1, also do: decompose reference → verify against response
            response_reference = await self._decompose_and_verify_claims(
                reference, response
            )
        else:
            response_reference = np.array([], dtype=bool)

        # Step 2: Compute TP, FP, FN exactly like legacy
        tp = int(np.sum(reference_response))
        fp = int(np.sum(~reference_response))
        if self.mode != "precision":
            fn = int(np.sum(~response_reference))
        else:
            fn = 0

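        # Worked example (hypothetical counts): tp=2, fp=1, fn=1 gives
        # precision = 2/3, recall = 2/3 and, for beta=1, F1 = 2*2 / (2*2 + 1 + 1) = 2/3.
        # The 1e-8 terms below only guard against division by zero.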
        # Step 3: Compute final score based on mode
        if self.mode == "precision":
            score = tp / (tp + fp + 1e-8)
        elif self.mode == "recall":
            score = tp / (tp + fn + 1e-8)
        else:  # f1
            score = fbeta_score(tp, fp, fn, self.beta)

        return MetricResult(value=float(np.round(score, 2)))

    async def _decompose_claims(self, response: str) -> List[str]:
        """Break response into claims using configurable decomposition."""
        prompt = claim_decomposition_prompt(
            response, atomicity=self.atomicity, coverage=self.coverage
        )
        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
        return result.claims

    async def _verify_claims(
        self, claims: List[str], reference: str
    ) -> NLIStatementOutput:
        """Verify claims against reference using NLI."""
        prompt = nli_statement_prompt(reference, claims)
        result = await self.llm.agenerate(prompt, NLIStatementOutput)
        return result

    async def _decompose_and_verify_claims(
        self, text_to_decompose: str, reference_text: str
    ) -> np.ndarray:
        """Decompose text into claims and verify against reference."""
        claims = await self._decompose_claims(text_to_decompose)
        if not claims:
            return np.array([], dtype=bool)

        verdicts = await self._verify_claims(claims, reference_text)
        if not verdicts.statements:
            return np.array([], dtype=bool)

        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])