
Commit 09464cf

Migrate Context precision with + without ref (#2398)
1 parent b2b28d7 commit 09464cf

File tree

6 files changed: +858 −0 lines changed


src/ragas/metrics/collections/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -6,6 +6,12 @@
 from ragas.metrics.collections._answer_similarity import AnswerSimilarity
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._context_precision import (
+    ContextPrecision,
+    ContextPrecisionWithoutReference,
+    ContextPrecisionWithReference,
+    ContextUtilization,
+)
 from ragas.metrics.collections._context_relevance import ContextRelevance
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
@@ -28,7 +34,11 @@
     "AnswerSimilarity",
     "BleuScore",
     "ContextEntityRecall",
+    "ContextPrecision",
+    "ContextPrecisionWithReference",
+    "ContextPrecisionWithoutReference",
     "ContextRelevance",
+    "ContextUtilization",
     "DistanceMeasure",
     "ExactMatch",
     "Faithfulness",
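
With these exports in place, the four new metrics become part of the collections public surface. A minimal import sketch (not part of the diff, shown only for orientation):

from ragas.metrics.collections import (
    ContextPrecision,                  # wrapper with the legacy "context_precision" name
    ContextPrecisionWithReference,
    ContextPrecisionWithoutReference,
    ContextUtilization,                # wrapper with the legacy "context_utilization" name
)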
src/ragas/metrics/collections/_context_precision.py

Lines changed: 330 additions & 0 deletions
@@ -0,0 +1,330 @@
+"""Context Precision metrics v2 - Modern implementation with function-based prompts."""
+
+import typing as t
+from typing import List
+
+import numpy as np
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.context_precision import (
+    context_precision_with_reference_prompt,
+    context_precision_without_reference_prompt,
+)
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class ContextPrecisionOutput(BaseModel):
+    """Structured output for context precision evaluation."""
+
+    reason: str
+    verdict: int
+
+
+class ContextPrecisionWithReference(BaseMetric):
+    """
+    Modern v2 implementation of context precision with reference.
+
+    Evaluates whether retrieved contexts are useful for answering a question by comparing
+    each context against a reference answer. The metric calculates average precision
+    based on the usefulness verdicts from an LLM.
+
+    This implementation uses modern instructor LLMs with structured output.
+    Only supports modern components - legacy wrappers are rejected with clear error messages.
+
+    Usage:
+        >>> import openai
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ContextPrecisionWithReference
+        >>>
+        >>> # Setup dependencies
+        >>> client = openai.AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o-mini", client=client)
+        >>>
+        >>> # Create metric instance
+        >>> metric = ContextPrecisionWithReference(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="What is the capital of France?",
+        ...     reference="Paris is the capital of France.",
+        ...     retrieved_contexts=["Paris is the capital and largest city of France.", "Berlin is the capital of Germany."]
+        ... )
+        >>> print(f"Context Precision: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for context evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "context_precision_with_reference",
+        **kwargs,
+    ):
+        """
+        Initialize ContextPrecisionWithReference metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for context evaluation
+            name: The metric name
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, user_input: str, reference: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate context precision score using reference.
+
+        Args:
+            user_input: The question being asked
+            reference: The reference answer to compare against
+            retrieved_contexts: The retrieved contexts to evaluate
+
+        Returns:
+            MetricResult with context precision score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not user_input:
+            raise ValueError("user_input cannot be empty")
+        if not reference:
+            raise ValueError("reference cannot be empty")
+        if not retrieved_contexts:
+            raise ValueError("retrieved_contexts cannot be empty")
+
+        # Evaluate each retrieved context
+        verdicts = []
+        for context in retrieved_contexts:
+            prompt = context_precision_with_reference_prompt(
+                user_input, context, reference
+            )
+            result = await self.llm.agenerate(prompt, ContextPrecisionOutput)
+            verdicts.append(result.verdict)
+
+        # Calculate average precision
+        score = self._calculate_average_precision(verdicts)
+        return MetricResult(value=float(score))
+
+    def _calculate_average_precision(self, verdicts: List[int]) -> float:
+        """Calculate average precision from binary verdicts. Matches legacy logic exactly."""
+        verdict_list = verdicts
+        denominator = sum(verdict_list) + 1e-10
+        numerator = sum(
+            [
+                (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i]
+                for i in range(len(verdict_list))
+            ]
+        )
+        score = numerator / denominator
+
+        if np.isnan(score):
+            # Match legacy warning behavior
+            import logging
+
+            logging.warning(
+                "Invalid response format. Expected a list of dictionaries with keys 'verdict'"
+            )
+
+        return score
+
+
+class ContextPrecisionWithoutReference(BaseMetric):
+    """
+    Modern v2 implementation of context precision without reference.
+
+    Evaluates whether retrieved contexts are useful for answering a question by comparing
+    each context against the generated response. The metric calculates average precision
+    based on the usefulness verdicts from an LLM.
+
+    This implementation uses modern instructor LLMs with structured output.
+    Only supports modern components - legacy wrappers are rejected with clear error messages.
+
+    Usage:
+        >>> import openai
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ContextPrecisionWithoutReference
+        >>>
+        >>> # Setup dependencies
+        >>> client = openai.AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o-mini", client=client)
+        >>>
+        >>> # Create metric instance
+        >>> metric = ContextPrecisionWithoutReference(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="What is the capital of France?",
+        ...     response="Paris is the capital of France.",
+        ...     retrieved_contexts=["Paris is the capital and largest city of France.", "Berlin is the capital of Germany."]
+        ... )
+        >>> print(f"Context Precision: {result.value}")
+
+    Attributes:
+        llm: Modern instructor-based LLM for context evaluation
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0, higher is better)
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "context_precision_without_reference",
+        **kwargs,
+    ):
+        """
+        Initialize ContextPrecisionWithoutReference metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for context evaluation
+            name: The metric name
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, user_input: str, response: str, retrieved_contexts: List[str]
+    ) -> MetricResult:
+        """
+        Calculate context precision score using response.
+
+        Args:
+            user_input: The question being asked
+            response: The response that was generated
+            retrieved_contexts: The retrieved contexts to evaluate
+
+        Returns:
+            MetricResult with context precision score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not user_input:
+            raise ValueError("user_input cannot be empty")
+        if not response:
+            raise ValueError("response cannot be empty")
+        if not retrieved_contexts:
+            raise ValueError("retrieved_contexts cannot be empty")
+
+        # Evaluate each retrieved context
+        verdicts = []
+        for context in retrieved_contexts:
+            prompt = context_precision_without_reference_prompt(
+                user_input, context, response
+            )
+            result = await self.llm.agenerate(prompt, ContextPrecisionOutput)
+            verdicts.append(result.verdict)
+
+        # Calculate average precision
+        score = self._calculate_average_precision(verdicts)
+        return MetricResult(value=float(score))
+
+    def _calculate_average_precision(self, verdicts: List[int]) -> float:
+        """Calculate average precision from binary verdicts. Matches legacy logic exactly."""
+        verdict_list = verdicts
+        denominator = sum(verdict_list) + 1e-10
+        numerator = sum(
+            [
+                (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i]
+                for i in range(len(verdict_list))
+            ]
+        )
+        score = numerator / denominator
+
+        if np.isnan(score):
+            # Match legacy warning behavior
+            import logging
+
+            logging.warning(
+                "Invalid response format. Expected a list of dictionaries with keys 'verdict'"
+            )
+
+        return score
+
+
+class ContextPrecision(ContextPrecisionWithReference):
+    """
+    Modern v2 wrapper for ContextPrecisionWithReference with shorter name.
+
+    This is a simple wrapper that provides the legacy "context_precision" name
+    while using the modern V2 implementation underneath.
+
+    Usage:
+        >>> import openai
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ContextPrecision
+        >>>
+        >>> # Setup dependencies
+        >>> client = openai.AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o-mini", client=client)
+        >>>
+        >>> # Create metric instance (same as ContextPrecisionWithReference)
+        >>> metric = ContextPrecision(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="What is the capital of France?",
+        ...     reference="Paris is the capital of France.",
+        ...     retrieved_contexts=["Paris is the capital and largest city of France."]
+        ... )
+    """
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        **kwargs,
+    ):
+        """Initialize ContextPrecision with the legacy default name."""
+        super().__init__(llm, name="context_precision", **kwargs)
+
+
+class ContextUtilization(ContextPrecisionWithoutReference):
+    """
+    Modern v2 wrapper for ContextPrecisionWithoutReference with shorter name.
+
+    This is a simple wrapper that provides the legacy "context_utilization" name
+    while using the modern V2 implementation underneath.
+
+    Usage:
+        >>> import openai
+        >>> from ragas.llms.base import llm_factory
+        >>> from ragas.metrics.collections import ContextUtilization
+        >>>
+        >>> # Setup dependencies
+        >>> client = openai.AsyncOpenAI()
+        >>> llm = llm_factory("gpt-4o-mini", client=client)
+        >>>
+        >>> # Create metric instance (same as ContextPrecisionWithoutReference)
+        >>> metric = ContextUtilization(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     user_input="What is the capital of France?",
+        ...     response="Paris is the capital of France.",
+        ...     retrieved_contexts=["Paris is the capital and largest city of France."]
+        ... )
+    """
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        **kwargs,
+    ):
+        """Initialize ContextUtilization with the legacy default name."""
+        super().__init__(llm, name="context_utilization", **kwargs)
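
For reference, _calculate_average_precision applies the standard average-precision formula to the per-context binary verdicts, so useful contexts ranked earlier contribute more to the score. A small standalone sketch of the same arithmetic on a hypothetical verdict list (not part of the diff):

def average_precision(verdicts):
    # Mirrors _calculate_average_precision above: precision@k weighted by each verdict.
    denominator = sum(verdicts) + 1e-10
    numerator = sum(
        (sum(verdicts[: i + 1]) / (i + 1)) * verdicts[i]
        for i in range(len(verdicts))
    )
    return numerator / denominator

# Contexts 1 and 3 judged useful, context 2 not:
# numerator = (1/1)*1 + (1/2)*0 + (2/3)*1 = 1.667; denominator = 2; score ~0.83
print(average_precision([1, 0, 1]))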

src/ragas/prompt/metrics/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -3,9 +3,17 @@
 from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
 from ragas.prompt.metrics.answer_relevance import answer_relevancy_prompt
 from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
+from ragas.prompt.metrics.context_precision import (
+    context_precision_prompt,
+    context_precision_with_reference_prompt,
+    context_precision_without_reference_prompt,
+)
 
 __all__ = [
     "answer_relevancy_prompt",
+    "context_precision_prompt",
+    "context_precision_with_reference_prompt",
+    "context_precision_without_reference_prompt",
     "correctness_classifier_prompt",
     "nli_statement_prompt",
     "statement_generator_prompt",
