Commit 57605dd

Migrate Answer Accuracy + Context Relevance (#2390)
1 parent c59ba02 commit 57605dd

File tree

7 files changed: +925 -0 lines changed

src/ragas/metrics/collections/__init__.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -1,5 +1,6 @@
 """Collections of metrics using modern component architecture."""

+from ragas.metrics.collections._answer_accuracy import AnswerAccuracy
 from ragas.metrics.collections._answer_correctness import AnswerCorrectness
 from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
 from ragas.metrics.collections._answer_similarity import AnswerSimilarity
@@ -13,6 +14,7 @@
 )
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
+from ragas.metrics.collections._context_relevance import ContextRelevance
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
 from ragas.metrics.collections._rouge_score import RougeScore
@@ -29,12 +31,14 @@

 __all__ = [
     "BaseMetric",  # Base class
+    "AnswerAccuracy",
     "AnswerCorrectness",
     "AnswerRelevancy",
     "AnswerSimilarity",
     "AspectCritic",
     "BleuScore",
     "ContextEntityRecall",
+    "ContextRelevance",
     "DistanceMeasure",
     "ExactMatch",
     "Faithfulness",
```
Lines changed: 171 additions & 0 deletions (new file)

```python
"""Answer Accuracy metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_accuracy import (
    answer_accuracy_judge1_prompt,
    answer_accuracy_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class JudgeRating(BaseModel):
    """Structured output for judge rating."""

    rating: int


class AnswerAccuracy(BaseMetric):
    """
    Modern v2 implementation of answer accuracy evaluation.

    Measures answer accuracy compared to ground truth using a dual-judge system.
    This metric averages two distinct judge prompts to ensure robust evaluation.

    The metric uses NVIDIA's proven dual-judge approach:
    1. Judge 1: Direct User Answer vs Reference Answer comparison
    2. Judge 2: Swapped perspective for fairness
    3. Average both judges for final score

    Rating scale: 0 (no match), 2 (partial match), 4 (exact match)
    Final score: Average of both judges converted to 0.0-1.0 scale

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import AnswerAccuracy
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o")
        >>>
        >>> # Create metric instance
        >>> metric = AnswerAccuracy(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="When was Einstein born?",
        ...     response="Albert Einstein was born in 1879.",
        ...     reference="Albert Einstein was born in 1879."
        ... )
        >>> print(f"Answer Accuracy: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for dual-judge evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
        max_retries: Maximum retry attempts for invalid ratings
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "answer_accuracy",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize AnswerAccuracy metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, user_input: str, response: str, reference: str
    ) -> MetricResult:
        """
        Calculate answer accuracy score using dual-judge evaluation.

        Args:
            user_input: The original question
            response: The user's answer to evaluate
            reference: The ground truth reference answer

        Returns:
            MetricResult with answer accuracy score (0.0-1.0, higher is better)
        """
        # Input validation
        if not user_input:
            raise ValueError(
                "user_input is missing. Please add user_input to the test sample."
            )
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not reference:
            raise ValueError(
                "reference is missing. Please add reference to the test sample."
            )

        # Get ratings from both judges with NVIDIA temperature (0.1)
        judge1_rating = await self._get_judge_rating(
            answer_accuracy_judge1_prompt(user_input, response, reference)
        )
        judge2_rating = await self._get_judge_rating(
            answer_accuracy_judge2_prompt(
                user_input, reference, response
            )  # Note: swapped order
        )

        # Average the scores (convert from 0,2,4 scale to 0.0-1.0)
        score = self._average_scores(judge1_rating / 4.0, judge2_rating / 4.0)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get rating from judge using structured JSON output."""
        for retry in range(self.max_retries):
            try:
                # Use structured output with JSON - clean and reliable
                result = await self.llm.agenerate(prompt, JudgeRating)
                rating = result.rating

                # Validate rating is in expected range
                if rating in [0, 2, 4]:
                    return float(rating)
                else:
                    # Invalid rating - retry or return NaN
                    if retry < self.max_retries - 1:
                        continue  # Retry if invalid rating
                    else:
                        return float("nan")

            except Exception:
                if retry < self.max_retries - 1:
                    continue  # Retry on exception
                else:
                    return float("nan")

        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values."""
        if not np.isnan(score1) and not np.isnan(score2):
            return (score1 + score2) / 2.0
        elif not np.isnan(score1):
            return score1
        elif not np.isnan(score2):
            return score2
        else:
            return float("nan")
```
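
To make the scoring arithmetic concrete: each judge returns 0, 2, or 4, each rating is divided by 4.0, and `_average_scores` averages the two results while tolerating a NaN from a judge that failed all retries. A standalone sketch of that logic (illustration only, not part of the commit):

```python
import math


def average_scores(score1: float, score2: float) -> float:
    """Mirrors AnswerAccuracy._average_scores: average both judges, or fall
    back to whichever judge produced a valid (non-NaN) score."""
    if not math.isnan(score1) and not math.isnan(score2):
        return (score1 + score2) / 2.0
    if not math.isnan(score1):
        return score1
    if not math.isnan(score2):
        return score2
    return float("nan")


# Judge 1 says exact match (4), judge 2 says partial match (2):
print(average_scores(4 / 4.0, 2 / 4.0))       # 0.75
# Judge 2 exhausted its retries (NaN), so only judge 1 counts:
print(average_scores(4 / 4.0, float("nan")))  # 1.0
```
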
Lines changed: 177 additions & 0 deletions (new file)

```python
"""Context Relevance metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.context_relevance import (
    context_relevance_judge1_prompt,
    context_relevance_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class RelevanceRating(BaseModel):
    """Structured output for relevance rating."""

    rating: int


class ContextRelevance(BaseMetric):
    """
    Modern v2 implementation of context relevance evaluation.

    Evaluates whether the retrieved contexts are pertinent to the user input
    using a dual-judge system. This metric averages two distinct judge prompts
    to ensure robust evaluation.

    The metric uses NVIDIA's proven dual-judge approach:
    1. Judge 1: Direct context relevance evaluation
    2. Judge 2: Alternative perspective for fairness
    3. Average both judges for final score

    Rating scale: 0 (not relevant), 1 (partially relevant), 2 (fully relevant)
    Final score: Average of both judges converted to 0.0-1.0 scale

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import ContextRelevance
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o")
        >>>
        >>> # Create metric instance
        >>> metric = ContextRelevance(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="When was Einstein born?",
        ...     retrieved_contexts=["Albert Einstein was born March 14, 1879."]
        ... )
        >>> print(f"Context Relevance: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for dual-judge evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
        max_retries: Maximum retry attempts for invalid ratings
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "context_relevance",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize ContextRelevance metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, user_input: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate context relevance score using dual-judge evaluation.

        Args:
            user_input: The original question
            retrieved_contexts: The retrieved contexts to evaluate for relevance

        Returns:
            MetricResult with context relevance score (0.0-1.0, higher is better)
        """
        # Input validation
        if not user_input:
            raise ValueError(
                "user_input is missing. Please add user_input to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Handle edge cases like legacy
        context_str = "\n".join(retrieved_contexts)

        if not user_input.strip() or not context_str.strip():
            return MetricResult(value=0.0)

        # Edge case: if user input matches context exactly
        if user_input.strip() == context_str.strip():
            return MetricResult(value=0.0)

        # Edge case: if context is contained in user input
        if context_str.strip() in user_input.strip():
            return MetricResult(value=0.0)

        # Get ratings from both judges with NVIDIA temperature (0.1)
        judge1_rating = await self._get_judge_rating(
            context_relevance_judge1_prompt(user_input, context_str)
        )
        judge2_rating = await self._get_judge_rating(
            context_relevance_judge2_prompt(user_input, context_str)
        )

        # Average the scores (convert from 0,1,2 scale to 0.0-1.0)
        score = self._average_scores(judge1_rating / 2.0, judge2_rating / 2.0)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get rating from judge with retry logic and NVIDIA temperature."""
        for retry in range(self.max_retries):
            try:
                result = await self.llm.agenerate(prompt, RelevanceRating)
                rating = result.rating

                # Validate rating is in expected range
                if rating in [0, 1, 2]:
                    return float(rating)
                else:
                    if retry < self.max_retries - 1:
                        continue  # Retry if invalid rating
                    else:
                        return float("nan")

            except Exception:
                if retry < self.max_retries - 1:
                    continue  # Retry on exception
                else:
                    return float("nan")

        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values."""
        if not np.isnan(score1) and not np.isnan(score2):
            return (score1 + score2) / 2.0
        elif not np.isnan(score1):
            return score1
        elif not np.isnan(score2):
            return score2
        else:
            return float("nan")
```
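
A short usage sketch mirroring the docstring above, also exercising one of the pre-LLM edge cases: when the joined context is identical to the question, `ascore` returns 0.0 without calling either judge. Values are hypothetical, and `llm` is assumed to be built as in the docstring:

```python
import asyncio

from ragas.metrics.collections import ContextRelevance


async def demo(metric: ContextRelevance) -> None:
    # Normal path: both judges rate relevance on the 0/1/2 scale and the
    # averaged rating is rescaled to 0.0-1.0.
    result = await metric.ascore(
        user_input="When was Einstein born?",
        retrieved_contexts=["Albert Einstein was born March 14, 1879."],
    )
    print(f"Context Relevance: {result.value}")

    # Edge case: context identical to the question short-circuits to 0.0.
    degenerate = await metric.ascore(
        user_input="When was Einstein born?",
        retrieved_contexts=["When was Einstein born?"],
    )
    print(degenerate.value)  # 0.0


# asyncio.run(demo(ContextRelevance(llm=llm)))  # llm as in the docstring above
```
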
