Commit dd29e70

feat: answer similarity migrated to collections (#2358)

1 parent d536c5d
File tree

3 files changed: +342 −0 lines changed

src/ragas/metrics/collections/__init__.py
src/ragas/metrics/collections/_answer_similarity.py
tests/e2e/metrics_migration/test_answer_similarity_migration.py

src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Collections of metrics using modern component architecture."""

 from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
+from ragas.metrics.collections._answer_similarity import AnswerSimilarity
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._string import (
@@ -14,6 +15,7 @@
 __all__ = [
     "BaseMetric",  # Base class
     "AnswerRelevancy",
+    "AnswerSimilarity",
     "BleuScore",
     "DistanceMeasure",
     "ExactMatch",
src/ragas/metrics/collections/_answer_similarity.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
"""Answer Similarity metric."""

import typing as t

import numpy as np

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult

if t.TYPE_CHECKING:
    from ragas.embeddings.base import BaseRagasEmbedding


class AnswerSimilarity(BaseMetric):
    """
    Evaluate semantic similarity between reference and response using embeddings.

    Usage:
        >>> from openai import AsyncOpenAI
        >>> from ragas.embeddings.base import embedding_factory
        >>> from ragas.metrics.collections import AnswerSimilarity
        >>>
        >>> # Setup embeddings
        >>> client = AsyncOpenAI()
        >>> embeddings = embedding_factory("openai", model="text-embedding-ada-002", client=client, interface="modern")
        >>>
        >>> # Create metric instance
        >>> metric = AnswerSimilarity(embeddings=embeddings)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference="Paris is the capital of France.",
        ...     response="The capital of France is Paris."
        ... )
        >>> print(f"Score: {result.value}")
        >>>
        >>> # Batch evaluation
        >>> results = await metric.abatch_score([
        ...     {"reference": "Text 1", "response": "Response 1"},
        ...     {"reference": "Text 2", "response": "Response 2"},
        ... ])

    Attributes:
        embeddings: Modern embeddings model with embed_text() method
        name: The metric name
        threshold: Optional threshold for binary classification
        allowed_values: Score range (0.0 to 1.0)
    """

    embeddings: "BaseRagasEmbedding"

    def __init__(
        self,
        embeddings: "BaseRagasEmbedding",
        name: str = "answer_similarity",
        threshold: t.Optional[float] = None,
        **kwargs,
    ):
        """Initialize AnswerSimilarity metric with required embeddings."""
        self.embeddings = embeddings
        self.threshold = threshold

        super().__init__(name=name, **kwargs)

    async def ascore(self, reference: str, response: str) -> MetricResult:
        """
        Calculate semantic similarity score asynchronously.

        Components are guaranteed to be validated and non-None by the base class.

        Args:
            reference: The reference/ground truth text
            response: The response text to evaluate

        Returns:
            MetricResult with similarity score (0.0-1.0)
        """
        reference = reference or " "
        response = response or " "

        embedding_1 = np.array(self.embeddings.embed_text(reference))
        embedding_2 = np.array(self.embeddings.embed_text(response))

        norms_1 = np.linalg.norm(embedding_1, keepdims=True)
        norms_2 = np.linalg.norm(embedding_2, keepdims=True)
        embedding_1_normalized = embedding_1 / norms_1
        embedding_2_normalized = embedding_2 / norms_2
        similarity = embedding_1_normalized @ embedding_2_normalized.T
        score = similarity.flatten()

        assert isinstance(score, np.ndarray), "Expects ndarray"
        if self.threshold:
            score = score >= self.threshold

        return MetricResult(value=float(score.item()))
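
The score here is plain cosine similarity: both embeddings are L2-normalized and the dot product of the unit vectors is taken, i.e. score = (e1 · e2) / (‖e1‖ ‖e2‖). A self-contained sketch of that computation with toy vectors (the numbers are illustrative, not from any real embedding model):

    # Self-contained sketch of the cosine-similarity computation above,
    # using toy vectors in place of real embeddings.
    import numpy as np

    embedding_1 = np.array([0.2, 0.8, 0.1])
    embedding_2 = np.array([0.3, 0.7, 0.2])

    # Normalize each vector to unit length, then take the dot product.
    e1 = embedding_1 / np.linalg.norm(embedding_1)
    e2 = embedding_2 / np.linalg.norm(embedding_2)
    score = float(e1 @ e2)

    # Same thing in closed form: cos(theta) = (a . b) / (|a| * |b|)
    expected = float(
        (embedding_1 @ embedding_2)
        / (np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2))
    )
    assert abs(score - expected) < 1e-12
    print(score)  # ~0.98 for these toy vectors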
tests/e2e/metrics_migration/test_answer_similarity_migration.py

Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
"""E2E tests for Answer Similarity metric migration from v1 to v2 (class-based)."""

import pytest

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import AnswerSimilarity as LegacyAnswerSimilarity, MetricResult
from ragas.metrics.collections import AnswerSimilarity


class TestAnswerSimilarityE2EMigration:
    """E2E test compatibility between legacy AnswerSimilarity and new V2 AnswerSimilarity with automatic validation."""

    @pytest.fixture
    def sample_data(self):
        """Real-world test cases for answer similarity evaluation."""
        return [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "description": "Semantically similar with word reordering",
            },
            {
                "reference": "Python is a high-level programming language known for its simplicity and readability.",
                "response": "Python is a programming language that emphasizes code readability.",
                "description": "Similar content with paraphrasing",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks with multiple layers.",
                "description": "Related but different concepts",
            },
            {
                "reference": "The quick brown fox jumps over the lazy dog.",
                "response": "A slow red cat walks under the active mouse.",
                "description": "Different content with similar structure",
            },
            {
                "reference": "",
                "response": "Some response text",
                "description": "Empty reference",
            },
        ]

    @pytest.fixture
    def test_legacy_embeddings(self):
        """Create legacy embeddings for legacy implementation."""
        try:
            from ragas.embeddings.base import embedding_factory

            return embedding_factory("text-embedding-ada-002")
        except ImportError as e:
            pytest.skip(f"Embedding factory not available: {e}")
        except Exception as e:
            pytest.skip(
                f"Could not create legacy embeddings (API key may be missing): {e}"
            )

    @pytest.fixture
    def test_modern_embeddings(self):
        """Create modern embeddings for v2 implementation."""
        try:
            import openai

            from ragas.embeddings.base import embedding_factory

            client = openai.AsyncOpenAI()

            return embedding_factory(
                provider="openai",
                model="text-embedding-ada-002",
                client=client,
                interface="modern",
            )
        except ImportError as e:
            pytest.skip(f"OpenAI or embedding factory not available: {e}")
        except Exception as e:
            pytest.skip(
                f"Could not create modern embeddings (API key may be missing): {e}"
            )

    @pytest.mark.asyncio
    async def test_legacy_answer_similarity_vs_v2_answer_similarity_e2e_compatibility(
        self,
        sample_data,
        test_legacy_embeddings,
        test_modern_embeddings,
    ):
        """E2E test that legacy and v2 implementations produce identical scores with real embeddings."""

        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        for i, data in enumerate(sample_data):
            print(
                f"\n🧪 Testing Answer Similarity - Case {i + 1}: {data['description']}"
            )
            print(f" Reference: {data['reference'][:50]}...")
            print(f" Response: {data['response'][:50]}...")

            legacy_answer_similarity = LegacyAnswerSimilarity(
                embeddings=test_legacy_embeddings
            )
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=data["response"],
                reference=data["reference"],
            )
            legacy_score = await legacy_answer_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_answer_similarity = AnswerSimilarity(embeddings=test_modern_embeddings)
            v2_answer_similarity_result = await v2_answer_similarity.ascore(
                reference=data["reference"],
                response=data["response"],
            )

            score_diff = abs(legacy_score - v2_answer_similarity_result.value)
            print(f" Legacy: {legacy_score:.6f}")
            print(f" V2 Class: {v2_answer_similarity_result.value:.6f}")
            print(f" Diff: {score_diff:.10f}")

            assert score_diff < 1e-6, (
                f"Case {i + 1} ({data['description']}): Mismatch: {legacy_score} vs {v2_answer_similarity_result.value}"
            )

            assert isinstance(legacy_score, float)
            assert isinstance(v2_answer_similarity_result, MetricResult)
            assert 0.0 <= legacy_score <= 1.0
            assert 0.0 <= v2_answer_similarity_result.value <= 1.0

            print(" ✅ Scores match!")

    @pytest.mark.asyncio
    async def test_answer_similarity_with_threshold(
        self, test_legacy_embeddings, test_modern_embeddings
    ):
        """Test that both implementations correctly handle threshold parameter."""

        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        test_cases = [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "threshold": 0.9,
                "description": "High similarity with high threshold",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks.",
                "threshold": 0.5,
                "description": "Different content with medium threshold",
            },
        ]

        for case in test_cases:
            print(f"\n🎯 Testing threshold: {case['description']}")

            legacy_answer_similarity = LegacyAnswerSimilarity(
                embeddings=test_legacy_embeddings, threshold=case["threshold"]
            )
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=case["response"],
                reference=case["reference"],
            )
            legacy_score = await legacy_answer_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_answer_similarity = AnswerSimilarity(
                embeddings=test_modern_embeddings, threshold=case["threshold"]
            )
            v2_result = await v2_answer_similarity.ascore(
                reference=case["reference"],
                response=case["response"],
            )

            print(f" Reference: {case['reference']}")
            print(f" Response: {case['response']}")
            print(f" Threshold: {case['threshold']}")
            print(f" Legacy: {legacy_score:.6f}")
            print(f" V2 Class: {v2_result.value:.6f}")

            score_diff = abs(legacy_score - v2_result.value)
            assert score_diff < 1e-6, (
                f"Threshold test failed: {legacy_score} vs {v2_result.value}"
            )

            assert legacy_score in [0.0, 1.0]
            assert v2_result.value in [0.0, 1.0]

            print(" ✅ Threshold handling matches!")

    @pytest.mark.asyncio
    async def test_v2_class_batch_processing(self, sample_data, test_modern_embeddings):
        """Test V2 class-based AnswerSimilarity batch processing."""

        if test_modern_embeddings is None:
            pytest.skip("Modern embeddings required for V2 testing")

        metric = AnswerSimilarity(embeddings=test_modern_embeddings)

        batch_inputs = [
            {"reference": case["reference"], "response": case["response"]}
            for case in sample_data[:3]
        ]

        print(f"\n📦 Testing V2 class batch processing with {len(batch_inputs)} items:")

        results = await metric.abatch_score(batch_inputs)

        assert len(results) == len(batch_inputs)

        for i, (case, result) in enumerate(zip(sample_data[:3], results)):
            print(f" Case {i + 1}: {result.value:.6f} - {case['description']}")
            assert isinstance(result.value, float)
            assert 0.0 <= result.value <= 1.0
            assert result.reason is None

        print(" ✅ V2 class batch processing works correctly!")

    def test_answer_similarity_migration_requirements_documented(self):
        """Document the requirements for running full E2E answer similarity tests."""

        requirements = {
            "embeddings": "OpenAI embeddings, HuggingFace embeddings, or similar",
            "environment": "API keys configured for embedding providers",
            "purpose": "Verify that v2 class-based implementation produces identical results to legacy implementation",
        }

        print("\n📋 Answer Similarity E2E Test Requirements:")
        for key, value in requirements.items():
            print(f" {key.capitalize()}: {value}")

        print("\n🚀 To enable full E2E testing:")
        print(" 1. Configure embedding provider (e.g., export OPENAI_API_KEY=...)")
        print(" 2. Remove @pytest.mark.skip decorators")
        print(
            " 3. Run: pytest tests/e2e/metrics_migration/test_answer_similarity_migration.py -v -s"
        )

        assert True
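
The threshold assertions in the test above (legacy_score in [0.0, 1.0]) follow from the post-processing step in ascore(): when a threshold is set, the cosine score is replaced by a boolean comparison, so the returned value collapses to 0.0 or 1.0. A minimal sketch of just that step, with an illustrative raw score standing in for real embeddings:

    # Minimal sketch of the thresholding step in AnswerSimilarity.ascore(),
    # using an illustrative raw cosine score instead of real embeddings.
    import numpy as np

    raw_score = np.array([0.93])   # cosine similarity, as computed by ascore()
    threshold = 0.9                # e.g. the "high threshold" test case above

    score = raw_score
    if threshold:
        score = score >= threshold  # boolean ndarray

    print(float(score.item()))      # 1.0, since 0.93 >= 0.9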
