
Commit 33e075c

Migrate Context Precision with + without ref
1 parent 6105b31 commit 33e075c

5 files changed: +633 −0 lines

src/ragas/metrics/collections/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -14,6 +14,10 @@
 )
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._context_precision import (
+    ContextPrecisionWithoutReference,
+    ContextPrecisionWithReference,
+)
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore

@@ -37,6 +41,8 @@
     "AspectCritic",
     "BleuScore",
     "ContextEntityRecall",
+    "ContextPrecisionWithReference",
+    "ContextPrecisionWithoutReference",
     "ContextRelevance",
     "DistanceMeasure",
     "ExactMatch",
src/ragas/metrics/collections/_context_precision.py

Lines changed: 258 additions & 0 deletions (new file)
@@ -0,0 +1,258 @@
"""Context Precision metrics v2 - Modern implementation with function-based prompts."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.context_precision import (
    context_precision_with_reference_prompt,
    context_precision_without_reference_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ContextPrecisionOutput(BaseModel):
    """Structured output for context precision evaluation."""

    reason: str
    verdict: int


class ContextPrecisionWithReference(BaseMetric):
    """
    Modern v2 implementation of context precision with reference.

    Evaluates whether retrieved contexts are useful for answering a question by comparing
    each context against a reference answer. The metric calculates average precision
    based on the usefulness verdicts from an LLM.

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import openai
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import ContextPrecisionWithReference
        >>>
        >>> # Setup dependencies
        >>> client = openai.AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = ContextPrecisionWithReference(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is the capital of France?",
        ...     reference="Paris is the capital of France.",
        ...     retrieved_contexts=["Paris is the capital and largest city of France.", "Berlin is the capital of Germany."]
        ... )
        >>> print(f"Context Precision: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for context evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "context_precision_with_reference",
        **kwargs,
    ):
        """
        Initialize ContextPrecisionWithReference metric with required components.

        Args:
            llm: Modern instructor-based LLM for context evaluation
            name: The metric name
        """
        # Set attributes explicitly before calling super()
        self.llm = llm

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, user_input: str, reference: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate context precision score using reference.

        Args:
            user_input: The question being asked
            reference: The reference answer to compare against
            retrieved_contexts: The retrieved contexts to evaluate

        Returns:
            MetricResult with context precision score (0.0-1.0, higher is better)
        """
        # Input validation
        if not user_input:
            raise ValueError("user_input cannot be empty")
        if not reference:
            raise ValueError("reference cannot be empty")
        if not retrieved_contexts:
            raise ValueError("retrieved_contexts cannot be empty")

        # Evaluate each retrieved context
        verdicts = []
        for context in retrieved_contexts:
            prompt = context_precision_with_reference_prompt(
                user_input, context, reference
            )
            result = await self.llm.agenerate(prompt, ContextPrecisionOutput)
            verdicts.append(result.verdict)

        # Calculate average precision
        score = self._calculate_average_precision(verdicts)
        return MetricResult(value=float(score))

    def _calculate_average_precision(self, verdicts: List[int]) -> float:
        """Calculate average precision from binary verdicts. Matches legacy logic exactly."""
        verdict_list = verdicts
        denominator = sum(verdict_list) + 1e-10
        numerator = sum(
            [
                (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i]
                for i in range(len(verdict_list))
            ]
        )
        score = numerator / denominator

        if np.isnan(score):
            # Match legacy warning behavior
            import logging

            logging.warning(
                "Invalid response format. Expected a list of dictionaries with keys 'verdict'"
            )

        return score


class ContextPrecisionWithoutReference(BaseMetric):
    """
    Modern v2 implementation of context precision without reference.

    Evaluates whether retrieved contexts are useful for answering a question by comparing
    each context against the generated response. The metric calculates average precision
    based on the usefulness verdicts from an LLM.

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import openai
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import ContextPrecisionWithoutReference
        >>>
        >>> # Setup dependencies
        >>> client = openai.AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = ContextPrecisionWithoutReference(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is the capital of France?",
        ...     response="Paris is the capital of France.",
        ...     retrieved_contexts=["Paris is the capital and largest city of France.", "Berlin is the capital of Germany."]
        ... )
        >>> print(f"Context Precision: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for context evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "context_precision_without_reference",
        **kwargs,
    ):
        """
        Initialize ContextPrecisionWithoutReference metric with required components.

        Args:
            llm: Modern instructor-based LLM for context evaluation
            name: The metric name
        """
        # Set attributes explicitly before calling super()
        self.llm = llm

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, user_input: str, response: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate context precision score using response.

        Args:
            user_input: The question being asked
            response: The response that was generated
            retrieved_contexts: The retrieved contexts to evaluate

        Returns:
            MetricResult with context precision score (0.0-1.0, higher is better)
        """
        # Input validation
        if not user_input:
            raise ValueError("user_input cannot be empty")
        if not response:
            raise ValueError("response cannot be empty")
        if not retrieved_contexts:
            raise ValueError("retrieved_contexts cannot be empty")

        # Evaluate each retrieved context
        verdicts = []
        for context in retrieved_contexts:
            prompt = context_precision_without_reference_prompt(
                user_input, context, response
            )
            result = await self.llm.agenerate(prompt, ContextPrecisionOutput)
            verdicts.append(result.verdict)

        # Calculate average precision
        score = self._calculate_average_precision(verdicts)
        return MetricResult(value=float(score))

    def _calculate_average_precision(self, verdicts: List[int]) -> float:
        """Calculate average precision from binary verdicts. Matches legacy logic exactly."""
        verdict_list = verdicts
        denominator = sum(verdict_list) + 1e-10
        numerator = sum(
            [
                (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i]
                for i in range(len(verdict_list))
            ]
        )
        score = numerator / denominator

        if np.isnan(score):
            # Match legacy warning behavior
            import logging

            logging.warning(
                "Invalid response format. Expected a list of dictionaries with keys 'verdict'"
            )

        return score

src/ragas/prompt/metrics/__init__.py

Lines changed: 8 additions & 0 deletions

@@ -3,9 +3,17 @@
 from ragas.prompt.metrics.answer_correctness import correctness_classifier_prompt
 from ragas.prompt.metrics.answer_relevance import answer_relevancy_prompt
 from ragas.prompt.metrics.common import nli_statement_prompt, statement_generator_prompt
+from ragas.prompt.metrics.context_precision import (
+    context_precision_prompt,
+    context_precision_with_reference_prompt,
+    context_precision_without_reference_prompt,
+)

 __all__ = [
     "answer_relevancy_prompt",
+    "context_precision_prompt",
+    "context_precision_with_reference_prompt",
+    "context_precision_without_reference_prompt",
     "correctness_classifier_prompt",
     "nli_statement_prompt",
     "statement_generator_prompt",
