
Commit b89d033

feat: semantic similarity migrated to collections (#2361)
1 parent f31c365 commit b89d033

3 files changed, +351 −0 lines

src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -4,6 +4,7 @@
 from ragas.metrics.collections._answer_similarity import AnswerSimilarity
 from ragas.metrics.collections._bleu_score import BleuScore
 from ragas.metrics.collections._rouge_score import RougeScore
+from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
 from ragas.metrics.collections._string import (
     DistanceMeasure,
     ExactMatch,
@@ -21,5 +22,6 @@
     "ExactMatch",
     "NonLLMStringSimilarity",
     "RougeScore",
+    "SemanticSimilarity",
     "StringPresence",
 ]
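
With the export in place, the metric is importable straight from the collections namespace. A minimal wiring sketch, mirroring the factory call from the module docstring below (assumes OPENAI_API_KEY is configured):

from openai import AsyncOpenAI

from ragas.embeddings.base import embedding_factory
from ragas.metrics.collections import SemanticSimilarity

# Modern-interface embeddings; the provider/model choice here just mirrors
# the usage example in the module docstring below.
client = AsyncOpenAI()
embeddings = embedding_factory(
    "openai", model="text-embedding-ada-002", client=client, interface="modern"
)
metric = SemanticSimilarity(embeddings=embeddings)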
src/ragas/metrics/collections/_semantic_similarity.py

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
"""Semantic Similarity metric."""

import typing as t

import numpy as np

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult

if t.TYPE_CHECKING:
    from ragas.embeddings.base import BaseRagasEmbedding


class SemanticSimilarity(BaseMetric):
    """
    Evaluate semantic similarity between reference and response using embeddings.

    Scores the semantic similarity of the ground truth against the generated
    answer using cosine similarity of embeddings. Based on the SAS paper:
    https://arxiv.org/pdf/2108.06130.pdf

    Usage:
        >>> from openai import AsyncOpenAI
        >>> from ragas.embeddings.base import embedding_factory
        >>> from ragas.metrics.collections import SemanticSimilarity
        >>>
        >>> # Set up embeddings
        >>> client = AsyncOpenAI()
        >>> embeddings = embedding_factory("openai", model="text-embedding-ada-002", client=client, interface="modern")
        >>>
        >>> # Create metric instance
        >>> metric = SemanticSimilarity(embeddings=embeddings)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference="Paris is the capital of France.",
        ...     response="The capital of France is Paris."
        ... )
        >>> print(f"Score: {result.value}")
        >>>
        >>> # Batch evaluation
        >>> results = await metric.abatch_score([
        ...     {"reference": "Text 1", "response": "Response 1"},
        ...     {"reference": "Text 2", "response": "Response 2"},
        ... ])

    Attributes:
        embeddings: Modern embeddings model with an embed_text() method
        name: The metric name
        threshold: Optional threshold for binary classification
        allowed_values: Score range (0.0 to 1.0)
    """

    embeddings: "BaseRagasEmbedding"

    def __init__(
        self,
        embeddings: "BaseRagasEmbedding",
        name: str = "semantic_similarity",
        threshold: t.Optional[float] = None,
        **kwargs,
    ):
        """Initialize SemanticSimilarity metric with required embeddings."""
        self.embeddings = embeddings
        self.threshold = threshold

        super().__init__(name=name, **kwargs)

    async def ascore(self, reference: str, response: str) -> MetricResult:
        """
        Calculate semantic similarity score asynchronously.

        Components are guaranteed to be validated and non-None by the base class.

        Args:
            reference: The reference/ground truth text
            response: The response text to evaluate

        Returns:
            MetricResult with similarity score (0.0-1.0)
        """
        reference = reference or " "
        response = response or " "

        embedding_1 = np.array(self.embeddings.embed_text(reference))
        embedding_2 = np.array(self.embeddings.embed_text(response))

        norms_1 = np.linalg.norm(embedding_1, keepdims=True)
        norms_2 = np.linalg.norm(embedding_2, keepdims=True)
        embedding_1_normalized = embedding_1 / norms_1
        embedding_2_normalized = embedding_2 / norms_2
        similarity = embedding_1_normalized @ embedding_2_normalized.T
        score = similarity.flatten()

        assert isinstance(score, np.ndarray), "Expects ndarray"
        if self.threshold:
            score = score >= self.threshold

        return MetricResult(value=float(score.item()))
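
The scoring path above is plain cosine similarity: normalize each embedding to unit length, then take the dot product. A self-contained numpy sketch of the same arithmetic, with toy vectors standing in for real embeddings:

import numpy as np

# Toy stand-ins for real embedding vectors.
embedding_1 = np.array([0.2, 0.7, 0.1])
embedding_2 = np.array([0.25, 0.65, 0.05])

# Normalize to unit length, then dot: this is cosine similarity.
e1 = embedding_1 / np.linalg.norm(embedding_1)
e2 = embedding_2 / np.linalg.norm(embedding_2)
similarity = float(e1 @ e2)
print(f"cosine similarity: {similarity:.6f}")  # near 1.0 for near-parallel vectors

# With a threshold set, the score collapses to binary 0.0/1.0,
# matching the `if self.threshold:` branch in ascore().
threshold = 0.9
print(float(similarity >= threshold))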
tests/e2e/metrics_migration/test_semantic_similarity_migration.py

Lines changed: 250 additions & 0 deletions

@@ -0,0 +1,250 @@
"""E2E tests for Semantic Similarity metric migration from v1 to v2."""

import pytest

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import MetricResult
from ragas.metrics._answer_similarity import (
    SemanticSimilarity as LegacySemanticSimilarity,
)
from ragas.metrics.collections import SemanticSimilarity


class TestSemanticSimilarityE2EMigration:
    """E2E compatibility tests between the legacy SemanticSimilarity and the new V2 SemanticSimilarity with automatic validation."""

    @pytest.fixture
    def sample_data(self):
        """Real-world test cases for semantic similarity evaluation."""
        return [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "description": "Semantically similar with word reordering",
            },
            {
                "reference": "Python is a high-level programming language known for its simplicity and readability.",
                "response": "Python is a programming language that emphasizes code readability.",
                "description": "Similar content with paraphrasing",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks with multiple layers.",
                "description": "Related but different concepts",
            },
            {
                "reference": "The quick brown fox jumps over the lazy dog.",
                "response": "A slow red cat walks under the active mouse.",
                "description": "Different content with similar structure",
            },
            {
                "reference": "",
                "response": "Some response text",
                "description": "Empty reference",
            },
        ]

    @pytest.fixture
    def test_legacy_embeddings(self):
        """Create legacy embeddings for the legacy implementation."""
        try:
            from ragas.embeddings.base import embedding_factory

            return embedding_factory("text-embedding-ada-002")
        except ImportError as e:
            pytest.skip(f"Embedding factory not available: {e}")
        except Exception as e:
            pytest.skip(
                f"Could not create legacy embeddings (API key may be missing): {e}"
            )

    @pytest.fixture
    def test_modern_embeddings(self):
        """Create modern embeddings for the v2 implementation."""
        try:
            import openai

            from ragas.embeddings.base import embedding_factory

            client = openai.AsyncOpenAI()

            return embedding_factory(
                provider="openai",
                model="text-embedding-ada-002",
                client=client,
                interface="modern",
            )
        except ImportError as e:
            pytest.skip(f"OpenAI or embedding factory not available: {e}")
        except Exception as e:
            pytest.skip(
                f"Could not create modern embeddings (API key may be missing): {e}"
            )

    @pytest.mark.asyncio
    async def test_legacy_semantic_similarity_vs_v2_semantic_similarity_e2e_compatibility(
        self,
        sample_data,
        test_legacy_embeddings,
        test_modern_embeddings,
    ):
        """E2E test that legacy and v2 implementations produce identical scores with real embeddings."""

        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        for i, data in enumerate(sample_data):
            print(
                f"\n🧪 Testing Semantic Similarity - Case {i + 1}: {data['description']}"
            )
            print(f"   Reference: {data['reference'][:50]}...")
            print(f"   Response: {data['response'][:50]}...")

            legacy_semantic_similarity = LegacySemanticSimilarity(
                embeddings=test_legacy_embeddings
            )
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=data["response"],
                reference=data["reference"],
            )
            legacy_score = await legacy_semantic_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_semantic_similarity = SemanticSimilarity(
                embeddings=test_modern_embeddings
            )
            v2_semantic_similarity_result = await v2_semantic_similarity.ascore(
                reference=data["reference"],
                response=data["response"],
            )

            score_diff = abs(legacy_score - v2_semantic_similarity_result.value)
            print(f"   Legacy: {legacy_score:.6f}")
            print(f"   V2 Class: {v2_semantic_similarity_result.value:.6f}")
            print(f"   Diff: {score_diff:.10f}")

            assert score_diff < 1e-6, (
                f"Case {i + 1} ({data['description']}): Mismatch: {legacy_score} vs {v2_semantic_similarity_result.value}"
            )

            assert isinstance(legacy_score, float)
            assert isinstance(v2_semantic_similarity_result, MetricResult)
            assert 0.0 <= legacy_score <= 1.0
            assert 0.0 <= v2_semantic_similarity_result.value <= 1.0

            print("   ✅ Scores match!")

    @pytest.mark.asyncio
    async def test_semantic_similarity_with_threshold(
        self, test_legacy_embeddings, test_modern_embeddings
    ):
        """Test that both implementations correctly handle the threshold parameter."""

        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        test_cases = [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "threshold": 0.9,
                "description": "High similarity with high threshold",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks.",
                "threshold": 0.5,
                "description": "Different content with medium threshold",
            },
        ]

        for case in test_cases:
            print(f"\n🎯 Testing threshold: {case['description']}")

            legacy_semantic_similarity = LegacySemanticSimilarity(
                embeddings=test_legacy_embeddings, threshold=case["threshold"]
            )
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=case["response"],
                reference=case["reference"],
            )
            legacy_score = await legacy_semantic_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_semantic_similarity = SemanticSimilarity(
                embeddings=test_modern_embeddings, threshold=case["threshold"]
            )
            v2_result = await v2_semantic_similarity.ascore(
                reference=case["reference"],
                response=case["response"],
            )

            print(f"   Reference: {case['reference']}")
            print(f"   Response: {case['response']}")
            print(f"   Threshold: {case['threshold']}")
            print(f"   Legacy: {legacy_score:.6f}")
            print(f"   V2 Class: {v2_result.value:.6f}")

            score_diff = abs(legacy_score - v2_result.value)
            assert score_diff < 1e-6, (
                f"Threshold test failed: {legacy_score} vs {v2_result.value}"
            )

            assert legacy_score in [0.0, 1.0]
            assert v2_result.value in [0.0, 1.0]

            print("   ✅ Threshold handling matches!")

    @pytest.mark.asyncio
    async def test_v2_class_batch_processing(self, sample_data, test_modern_embeddings):
        """Test V2 class-based SemanticSimilarity batch processing."""

        if test_modern_embeddings is None:
            pytest.skip("Modern embeddings required for V2 testing")

        metric = SemanticSimilarity(embeddings=test_modern_embeddings)

        batch_inputs = [
            {"reference": case["reference"], "response": case["response"]}
            for case in sample_data[:3]
        ]

        print(f"\n📦 Testing V2 class batch processing with {len(batch_inputs)} items:")

        results = await metric.abatch_score(batch_inputs)

        assert len(results) == len(batch_inputs)

        for i, (case, result) in enumerate(zip(sample_data[:3], results)):
            print(f"   Case {i + 1}: {result.value:.6f} - {case['description']}")
            assert isinstance(result.value, float)
            assert 0.0 <= result.value <= 1.0
            assert result.reason is None

        print("   ✅ V2 class batch processing works correctly!")

    def test_semantic_similarity_migration_requirements_documented(self):
        """Document the requirements for running full E2E semantic similarity tests."""

        requirements = {
            "embeddings": "OpenAI embeddings, HuggingFace embeddings, or similar",
            "environment": "API keys configured for embedding providers",
            "purpose": "Verify that the v2 class-based implementation produces identical results to the legacy implementation",
        }

        print("\n📋 Semantic Similarity E2E Test Requirements:")
        for key, value in requirements.items():
            print(f"   {key.capitalize()}: {value}")

        print("\n🚀 To enable full E2E testing:")
        print("   1. Configure embedding provider (e.g., export OPENAI_API_KEY=...)")
        print("   2. Remove @pytest.mark.skip decorators")
        print(
            "   3. Run: pytest tests/e2e/metrics_migration/test_semantic_similarity_migration.py -v -s"
        )

        assert True
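
These tests skip themselves when no embedding provider is configured. For a quick offline sanity check of the v2 metric's arithmetic, one option is a stub embedder; `FakeEmbeddings` below is hypothetical (not part of ragas) and assumes the base class accepts any object exposing the embed_text() method the metric calls — if it type-checks against BaseRagasEmbedding, subclass that instead:

import asyncio

from ragas.metrics.collections import SemanticSimilarity


class FakeEmbeddings:
    """Hypothetical stub: deterministic toy vectors, no API calls."""

    def embed_text(self, text: str):
        # Two fixed unit vectors, keyed on text length parity.
        return [1.0, 0.0, 0.0] if len(text) % 2 == 0 else [0.8, 0.6, 0.0]


async def main():
    metric = SemanticSimilarity(embeddings=FakeEmbeddings())
    result = await metric.ascore(reference="even", response="odd")
    print(result.value)  # cosine of the two toy vectors: 0.8


asyncio.run(main())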
