Commit 3522018

feat: migrate NonLLMContextRecall and IDBasedContextRecall to collections module
1 parent 8b4653c · commit 3522018

2 files changed: 183 additions, 1 deletion

src/ragas/metrics/collections/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -11,7 +11,11 @@
     ContextPrecisionWithReference,
     ContextUtilization,
 )
-from ragas.metrics.collections._context_recall import ContextRecall
+from ragas.metrics.collections._context_recall import (
+    ContextRecall,
+    IDBasedContextRecall,
+    NonLLMContextRecall,
+)
 from ragas.metrics.collections._context_relevance import ContextRelevance
 from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
@@ -45,7 +49,9 @@
     "ExactMatch",
     "FactualCorrectness",
     "Faithfulness",
+    "IDBasedContextRecall",
     "NoiseSensitivity",
+    "NonLLMContextRecall",
     "NonLLMStringSimilarity",
     "ResponseGroundedness",
     "RougeScore",

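With this change, the two new metrics are re-exported from the collections package alongside the existing ContextRecall, so downstream code can import them directly. A minimal illustration (assuming a ragas build that includes this commit):

    from ragas.metrics.collections import (
        ContextRecall,
        IDBasedContextRecall,
        NonLLMContextRecall,
    )
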
src/ragas/metrics/collections/_context_recall.py

Lines changed: 176 additions & 0 deletions
@@ -1,17 +1,23 @@
 """Context Recall metric v2 - Class-based implementation with modern components."""
 
+import logging
 import typing as t
+from typing import List
 
 import numpy as np
 from pydantic import BaseModel
 
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._string import DistanceMeasure, NonLLMStringSimilarity
 from ragas.metrics.collections.base import BaseMetric
 from ragas.metrics.result import MetricResult
 from ragas.prompt.metrics.context_recall import context_recall_prompt
 
 if t.TYPE_CHECKING:
     from ragas.llms.base import InstructorBaseRagasLLM
 
+logger = logging.getLogger(__name__)
+
 
 class ContextRecallClassification(BaseModel):
     """Structured output for a single statement classification."""
@@ -137,3 +143,173 @@ async def ascore(
         score = sum(attributions) / len(attributions) if attributions else np.nan
 
         return MetricResult(value=float(score))
+
+
+class NonLLMContextRecall(BaseMetric):
+    """
+    Evaluate context recall using string similarity without LLM.
+
+    Compares retrieved contexts with reference contexts using string similarity metrics.
+    A reference context is considered recalled if it has sufficient similarity with
+    at least one retrieved context.
+
+    This implementation provides deterministic evaluation without requiring LLM components.
+
+    Usage:
+        >>> from ragas.metrics.collections import NonLLMContextRecall
+        >>>
+        >>> metric = NonLLMContextRecall(threshold=0.5)
+        >>>
+        >>> result = await metric.ascore(
+        ...     retrieved_contexts=["Albert Einstein was a physicist"],
+        ...     reference_contexts=["Einstein was a theoretical physicist"]
+        ... )
+        >>> print(f"Context Recall: {result.value}")
+
+    Attributes:
+        name: The metric name
+        threshold: Similarity threshold for considering a context as recalled (default: 0.5)
+        distance_measure: The string distance measure to use (default: LEVENSHTEIN)
+        allowed_values: Score range (0.0 to 1.0)
+    """
+
+    def __init__(
+        self,
+        name: str = "non_llm_context_recall",
+        threshold: float = 0.5,
+        distance_measure: DistanceMeasure = DistanceMeasure.LEVENSHTEIN,
+        **kwargs,
+    ):
+        """
+        Initialize NonLLMContextRecall metric.
+
+        Args:
+            name: The metric name
+            threshold: Similarity threshold (0.0-1.0) for considering a context recalled
+            distance_measure: The string distance measure to use
+            **kwargs: Additional arguments passed to BaseMetric
+        """
+        super().__init__(name=name, **kwargs)
+        self.threshold = threshold
+        self._distance_measure = NonLLMStringSimilarity(
+            distance_measure=distance_measure
+        )
+
+    async def ascore(
+        self,
+        retrieved_contexts: List[str],
+        reference_contexts: List[str],
+    ) -> MetricResult:
+        """
+        Calculate context recall score using string similarity.
+
+        Args:
+            retrieved_contexts: List of retrieved context strings
+            reference_contexts: List of reference context strings
+
+        Returns:
+            MetricResult with recall score (0.0-1.0, higher is better)
+        """
+        if not retrieved_contexts:
+            raise ValueError("retrieved_contexts cannot be empty")
+        if not reference_contexts:
+            raise ValueError("reference_contexts cannot be empty")
+
+        scores = []
+        for ref in reference_contexts:
+            max_similarity = 0.0
+            for rc in retrieved_contexts:
+                # Use the distance measure to compute similarity
+                similarity = await self._distance_measure.single_turn_ascore(
+                    SingleTurnSample(reference=rc, response=ref),
+                    callbacks=None,
+                )
+                max_similarity = max(max_similarity, similarity)
+            scores.append(max_similarity)
+
+        # Compute recall: proportion of reference contexts above threshold
+        recalled = [1 if score > self.threshold else 0 for score in scores]
+        score = sum(recalled) / len(recalled) if recalled else np.nan
+
+        return MetricResult(value=float(score))
+
+
+class IDBasedContextRecall(BaseMetric):
+    """
+    Evaluate context recall by comparing retrieved and reference context IDs.
+
+    Directly compares retrieved context IDs with reference context IDs.
+    The score represents the proportion of reference IDs that were successfully retrieved.
+
+    This implementation works with both string and integer IDs and provides
+    deterministic evaluation without requiring LLM components.
+
+    Usage:
+        >>> from ragas.metrics.collections import IDBasedContextRecall
+        >>>
+        >>> metric = IDBasedContextRecall()
+        >>>
+        >>> result = await metric.ascore(
+        ...     retrieved_context_ids=["doc1", "doc2", "doc3"],
+        ...     reference_context_ids=["doc1", "doc2", "doc4"]
+        ... )
+        >>> print(f"Context Recall: {result.value}")  # 0.667
+
+    Attributes:
+        name: The metric name
+        allowed_values: Score range (0.0 to 1.0)
+    """
+
+    def __init__(
+        self,
+        name: str = "id_based_context_recall",
+        **kwargs,
+    ):
+        """
+        Initialize IDBasedContextRecall metric.
+
+        Args:
+            name: The metric name
+            **kwargs: Additional arguments passed to BaseMetric
+        """
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self,
+        retrieved_context_ids: t.Union[t.List[str], t.List[int]],
+        reference_context_ids: t.Union[t.List[str], t.List[int]],
+    ) -> MetricResult:
+        """
+        Calculate context recall score based on ID matching.
+
+        Args:
+            retrieved_context_ids: List of retrieved context IDs (strings or integers)
+            reference_context_ids: List of reference context IDs (strings or integers)
+
+        Returns:
+            MetricResult with recall score (0.0-1.0, higher is better)
+        """
+        if not retrieved_context_ids:
+            raise ValueError("retrieved_context_ids cannot be empty")
+        if not reference_context_ids:
+            raise ValueError("reference_context_ids cannot be empty")
+
+        # Convert all IDs to strings for consistent comparison
+        retrieved_ids_set = set(str(id_) for id_ in retrieved_context_ids)
+        reference_ids_set = set(str(id_) for id_ in reference_context_ids)
+
+        # Calculate how many reference IDs appear in retrieved IDs
+        hits = sum(
+            1 for ref_id in reference_ids_set if str(ref_id) in retrieved_ids_set
+        )
+
+        # Calculate recall score
+        total_refs = len(reference_ids_set)
+        score = hits / total_refs if total_refs > 0 else np.nan
+
+        if np.isnan(score):
+            logger.warning(
+                "No reference context IDs provided, cannot calculate recall."
+            )
+
+        return MetricResult(value=float(score))

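For reference, a minimal end-to-end sketch assembled from the docstring examples above; the inputs and printed values are illustrative and assume a ragas build that includes this commit:

    import asyncio

    from ragas.metrics.collections import IDBasedContextRecall, NonLLMContextRecall


    async def main():
        # String-similarity recall: a reference context counts as recalled when its
        # best match among the retrieved contexts exceeds the threshold (0.5 here).
        non_llm = NonLLMContextRecall(threshold=0.5)
        result = await non_llm.ascore(
            retrieved_contexts=["Albert Einstein was a physicist"],
            reference_contexts=["Einstein was a theoretical physicist"],
        )
        print(f"NonLLMContextRecall: {result.value}")

        # ID-based recall: proportion of reference IDs found among the retrieved IDs.
        # Two of the three reference IDs are retrieved, so the score is 2/3 ≈ 0.667.
        id_based = IDBasedContextRecall()
        result = await id_based.ascore(
            retrieved_context_ids=["doc1", "doc2", "doc3"],
            reference_context_ids=["doc1", "doc2", "doc4"],
        )
        print(f"IDBasedContextRecall: {result.value}")


    asyncio.run(main())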