
Commit 19fef93

Response Groundedness
1 parent ee40879 commit 19fef93

File tree

4 files changed, +456 -0 lines changed


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,7 @@
 from ragas.metrics.collections._factual_correctness import FactualCorrectness
 from ragas.metrics.collections._faithfulness import Faithfulness
 from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
+from ragas.metrics.collections._response_groundedness import ResponseGroundedness
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._string import (
@@ -36,6 +37,7 @@
     "Faithfulness",
     "NoiseSensitivity",
     "NonLLMStringSimilarity",
+    "ResponseGroundedness",
     "RougeScore",
     "SemanticSimilarity",
     "StringPresence",
src/ragas/metrics/collections/_response_groundedness.py

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
"""Response Groundedness metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class GroundednessRating(BaseModel):
    """Structured output for groundedness rating."""

    rating: int


class ResponseGroundedness(BaseMetric):
    """
    Modern v2 implementation of response groundedness evaluation.

    Evaluates how well grounded a response is in the retrieved contexts
    using a dual-judge system. This metric averages two distinct judge prompts
    to ensure robust evaluation.

    The metric uses NVIDIA's proven dual-judge approach:
    1. Judge 1: Direct groundedness evaluation with structured instructions
    2. Judge 2: Alternative perspective for fairness
    3. Average both judges for final score

    Rating scale: 0 (not grounded), 1 (partially grounded), 2 (fully grounded)
    Final score: Average of both judges converted to 0.0-1.0 scale

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import llm_factory
        >>> from ragas.metrics.collections import ResponseGroundedness
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = ResponseGroundedness(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     response="Einstein was born in Germany in 1879.",
        ...     retrieved_contexts=["Albert Einstein was born in Ulm, Germany on March 14, 1879."]
        ... )
        >>> print(f"Response Groundedness: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for dual-judge evaluation
        name: The metric name
        allowed_values: Score range (0.0 to 1.0, higher is better)
        max_retries: Maximum retry attempts for invalid ratings
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "response_groundedness",
        max_retries: int = 5,
        **kwargs,
    ):
        """
        Initialize ResponseGroundedness metric with required components.

        Args:
            llm: Modern instructor-based LLM for dual-judge evaluation
            name: The metric name
            max_retries: Maximum retry attempts for invalid ratings
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.max_retries = max_retries

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, response: str, retrieved_contexts: List[str]
    ) -> MetricResult:
        """
        Calculate response groundedness score using dual-judge evaluation.

        Args:
            response: The response to evaluate for groundedness
            retrieved_contexts: The retrieved contexts to check groundedness against

        Returns:
            MetricResult with response groundedness score (0.0-1.0, higher is better)
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Handle edge cases like legacy
        context_str = "\n".join(retrieved_contexts)

        if not response.strip() or not context_str.strip():
            return MetricResult(value=0.0)

        # Get ratings from both judges
        judge1_rating = await self._get_judge_rating(
            response_groundedness_judge1_prompt(response, context_str)
        )
        judge2_rating = await self._get_judge_rating(
            response_groundedness_judge2_prompt(response, context_str)
        )

        # Average the scores (convert from 0,1,2 scale to 0.0-1.0)
        score = self._average_scores(judge1_rating / 2.0, judge2_rating / 2.0)

        return MetricResult(value=float(score))

    async def _get_judge_rating(self, prompt: str) -> float:
        """Get rating from judge with retry logic."""
        for retry in range(self.max_retries):
            try:
                result = await self.llm.agenerate(prompt, GroundednessRating)
                rating = result.rating

                # Validate rating is in expected range
                if rating in [0, 1, 2]:
                    return float(rating)
                else:
                    if retry < self.max_retries - 1:
                        continue  # Retry if invalid rating
                    else:
                        return float("nan")

            except Exception:
                if retry < self.max_retries - 1:
                    continue  # Retry on exception
                else:
                    return float("nan")

        return float("nan")

    def _average_scores(self, score1: float, score2: float) -> float:
        """Average two judge scores, handling NaN values."""
        if not np.isnan(score1) and not np.isnan(score2):
            return (score1 + score2) / 2.0
        elif not np.isnan(score1):
            return score1
        elif not np.isnan(score2):
            return score2
        else:
            return float("nan")
src/ragas/prompt/metrics/response_groundedness.py

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
"""Response groundedness prompts - V1-identical converted to functions."""


def response_groundedness_judge1_prompt(response: str, context: str) -> str:
    """
    V1-identical response groundedness judge 1 prompt - matches template_groundedness1 exactly.

    Args:
        response: The response/assertion to evaluate for groundedness
        context: The context to evaluate the response against

    Returns:
        V1-identical prompt string for the LLM
    """
    return f"""### Instruction

You are a world class expert designed to evaluate the groundedness of an assertion.
You will be provided with an assertion and a context.
Your task is to determine if the assertion is supported by the context.
Follow the instructions below:
A. If there is no context or no assertion or context is empty or assertion is empty, say 0.
B. If the assertion is not supported by the context, say 0.
C. If the assertion is partially supported by the context, say 1.
D. If the assertion is fully supported by the context, say 2.
You must provide a rating of 0, 1, or 2, nothing else.

### Context:
<{context}>

### Assertion:
<{response}>

Analyzing Context and Response, the Groundedness score is """


def response_groundedness_judge2_prompt(response: str, context: str) -> str:
    """
    V1-identical response groundedness judge 2 prompt - matches template_groundedness2 exactly.

    Args:
        response: The response/assertion to evaluate for groundedness
        context: The context to evaluate the response against

    Returns:
        V1-identical prompt string for the LLM
    """
    return f"""As a specialist in assessing the strength of connections between statements and their given contexts, I will evaluate the level of support an assertion receives from the provided context. Follow these guidelines:

* If the assertion is not supported or context is empty or assertion is empty, assign a score of 0.
* If the assertion is partially supported, assign a score of 1.
* If the assertion is fully supported, assign a score of 2.

I will provide a rating of 0, 1, or 2, without any additional information.

---
**Context:**
[{context}]

**Assertion:**
[{response}]

Do not explain."""
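
Because both judges are plain string-formatting functions, the rendered prompts can be inspected without an LLM call. A quick sketch using the sample response and context from the metric's docstring; note that judge 1's template ends after "the Groundedness score is ", while the rating itself comes back through the GroundednessRating structured output:

from ragas.prompt.metrics.response_groundedness import (
    response_groundedness_judge1_prompt,
    response_groundedness_judge2_prompt,
)

response = "Einstein was born in Germany in 1879."
context = "Albert Einstein was born in Ulm, Germany on March 14, 1879."

# Each call returns the full prompt string sent to the corresponding judge.
print(response_groundedness_judge1_prompt(response, context))
print(response_groundedness_judge2_prompt(response, context))
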
