|
| 1 | +"""E2E tests for Answer Similarity metric migration from v1 to v2 (class-based).""" |
| 2 | + |
| 3 | +import pytest |
| 4 | + |
| 5 | +from ragas.dataset_schema import SingleTurnSample |
| 6 | +from ragas.metrics import AnswerSimilarity as LegacyAnswerSimilarity, MetricResult |
| 7 | +from ragas.metrics.collections import AnswerSimilarity |
| 8 | + |
| 9 | + |
class TestAnswerSimilarityE2EMigration:
    """E2E compatibility tests: legacy AnswerSimilarity vs. the V2 class.

    Each comparison test scores the same reference/response pair with the
    legacy metric (via ``_single_turn_ascore``) and with the V2 class-based
    metric (via ``ascore``) and requires the two scores to agree within 1e-6.

    The embedding fixtures call ``pytest.skip`` when the embeddings cannot be
    created, so the tests that depend on them are skipped rather than failed
    in environments without the required packages or credentials.
    """

    @pytest.fixture
    def sample_data(self):
        """Real-world test cases for answer similarity evaluation.

        Returns:
            A list of dicts, each with ``reference``, ``response`` and a
            human-readable ``description`` of what the case exercises.
        """
        return [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "description": "Semantically similar with word reordering",
            },
            {
                "reference": "Python is a high-level programming language known for its simplicity and readability.",
                "response": "Python is a programming language that emphasizes code readability.",
                "description": "Similar content with paraphrasing",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks with multiple layers.",
                "description": "Related but different concepts",
            },
            {
                "reference": "The quick brown fox jumps over the lazy dog.",
                "response": "A slow red cat walks under the active mouse.",
                "description": "Different content with similar structure",
            },
            {
                # Edge case: one side of the comparison is an empty string.
                "reference": "",
                "response": "Some response text",
                "description": "Empty reference",
            },
        ]

    @pytest.fixture
    def test_legacy_embeddings(self):
        """Create legacy embeddings for legacy implementation.

        Skips the requesting test if the embedding factory cannot be imported
        or if creating the embeddings raises for any other reason.
        """
        try:
            from ragas.embeddings.base import embedding_factory

            return embedding_factory("text-embedding-ada-002")
        except ImportError as e:
            pytest.skip(f"Embedding factory not available: {e}")
        except Exception as e:
            # Deliberately broad: any setup failure means the E2E environment
            # is unavailable, which should skip rather than fail the test.
            pytest.skip(
                f"Could not create legacy embeddings (API key may be missing): {e}"
            )

    @pytest.fixture
    def test_modern_embeddings(self):
        """Create modern embeddings for v2 implementation.

        Builds an ``openai.AsyncOpenAI`` client and passes it to
        ``embedding_factory`` with ``interface="modern"``. Skips the
        requesting test if ``openai`` or the factory cannot be imported, or if
        client/embedding creation raises for any other reason.
        """
        try:
            import openai

            from ragas.embeddings.base import embedding_factory

            client = openai.AsyncOpenAI()

            return embedding_factory(
                provider="openai",
                model="text-embedding-ada-002",
                client=client,
                interface="modern",
            )
        except ImportError as e:
            pytest.skip(f"OpenAI or embedding factory not available: {e}")
        except Exception as e:
            # Deliberately broad: any setup failure means the E2E environment
            # is unavailable, which should skip rather than fail the test.
            pytest.skip(
                f"Could not create modern embeddings (API key may be missing): {e}"
            )

    @pytest.mark.asyncio
    async def test_legacy_answer_similarity_vs_v2_answer_similarity_e2e_compatibility(
        self,
        sample_data,
        test_legacy_embeddings,
        test_modern_embeddings,
    ):
        """E2E test that legacy and v2 implementations produce identical scores with real embeddings."""

        # Defensive guard; the fixtures normally skip on their own when the
        # embeddings cannot be created.
        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        # Neither metric depends on the individual sample, so build each one
        # once instead of on every loop iteration.
        legacy_answer_similarity = LegacyAnswerSimilarity(
            embeddings=test_legacy_embeddings
        )
        v2_answer_similarity = AnswerSimilarity(embeddings=test_modern_embeddings)

        for i, data in enumerate(sample_data):
            print(
                f"\n🧪 Testing Answer Similarity - Case {i + 1}: {data['description']}"
            )
            print(f" Reference: {data['reference'][:50]}...")
            print(f" Response: {data['response'][:50]}...")

            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=data["response"],
                reference=data["reference"],
            )
            legacy_score = await legacy_answer_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_answer_similarity_result = await v2_answer_similarity.ascore(
                reference=data["reference"],
                response=data["response"],
            )

            # Check the result types before doing arithmetic or float
            # formatting on them, so a wrong return type fails with a clear
            # assertion instead of a TypeError further down.
            assert isinstance(legacy_score, float)
            assert isinstance(v2_answer_similarity_result, MetricResult)

            score_diff = abs(legacy_score - v2_answer_similarity_result.value)
            print(f" Legacy: {legacy_score:.6f}")
            print(f" V2 Class: {v2_answer_similarity_result.value:.6f}")
            print(f" Diff: {score_diff:.10f}")

            assert score_diff < 1e-6, (
                f"Case {i + 1} ({data['description']}): Mismatch: {legacy_score} vs {v2_answer_similarity_result.value}"
            )

            assert 0.0 <= legacy_score <= 1.0
            assert 0.0 <= v2_answer_similarity_result.value <= 1.0

            print(" ✅ Scores match!")

    @pytest.mark.asyncio
    async def test_answer_similarity_with_threshold(
        self, test_legacy_embeddings, test_modern_embeddings
    ):
        """Test that both implementations correctly handle threshold parameter.

        With a threshold set, both implementations are expected to return a
        binary score (0.0 or 1.0), and the two must agree.
        """

        # Defensive guard; the fixtures normally skip on their own when the
        # embeddings cannot be created.
        if test_legacy_embeddings is None or test_modern_embeddings is None:
            pytest.skip("Embeddings required for E2E testing")

        test_cases = [
            {
                "reference": "Paris is the capital of France.",
                "response": "The capital of France is Paris.",
                "threshold": 0.9,
                "description": "High similarity with high threshold",
            },
            {
                "reference": "Machine learning is a subset of artificial intelligence.",
                "response": "Deep learning uses neural networks.",
                "threshold": 0.5,
                "description": "Different content with medium threshold",
            },
        ]

        for case in test_cases:
            print(f"\n🎯 Testing threshold: {case['description']}")

            # The threshold differs per case, so the metrics must be rebuilt
            # inside the loop here.
            legacy_answer_similarity = LegacyAnswerSimilarity(
                embeddings=test_legacy_embeddings, threshold=case["threshold"]
            )
            legacy_sample = SingleTurnSample(
                user_input="dummy",
                response=case["response"],
                reference=case["reference"],
            )
            legacy_score = await legacy_answer_similarity._single_turn_ascore(
                legacy_sample, None
            )

            v2_answer_similarity = AnswerSimilarity(
                embeddings=test_modern_embeddings, threshold=case["threshold"]
            )
            v2_result = await v2_answer_similarity.ascore(
                reference=case["reference"],
                response=case["response"],
            )

            print(f" Reference: {case['reference']}")
            print(f" Response: {case['response']}")
            print(f" Threshold: {case['threshold']}")
            print(f" Legacy: {legacy_score:.6f}")
            print(f" V2 Class: {v2_result.value:.6f}")

            score_diff = abs(legacy_score - v2_result.value)
            assert score_diff < 1e-6, (
                f"Threshold test failed: {legacy_score} vs {v2_result.value}"
            )

            assert legacy_score in [0.0, 1.0]
            assert v2_result.value in [0.0, 1.0]

            print(" ✅ Threshold handling matches!")

    @pytest.mark.asyncio
    async def test_v2_class_batch_processing(self, sample_data, test_modern_embeddings):
        """Test V2 class-based AnswerSimilarity batch processing.

        Scores the first three sample cases through ``abatch_score`` and
        checks that one in-range float result with no ``reason`` comes back
        per input.
        """

        # Defensive guard; the fixture normally skips on its own when the
        # embeddings cannot be created.
        if test_modern_embeddings is None:
            pytest.skip("Modern embeddings required for V2 testing")

        metric = AnswerSimilarity(embeddings=test_modern_embeddings)

        # Slice once and reuse it for both the inputs and the result pairing
        # so the two can never drift apart.
        batch_cases = sample_data[:3]
        batch_inputs = [
            {"reference": case["reference"], "response": case["response"]}
            for case in batch_cases
        ]

        print(f"\n📦 Testing V2 class batch processing with {len(batch_inputs)} items:")

        results = await metric.abatch_score(batch_inputs)

        assert len(results) == len(batch_inputs)

        for i, (case, result) in enumerate(zip(batch_cases, results)):
            print(f" Case {i + 1}: {result.value:.6f} - {case['description']}")
            assert isinstance(result.value, float)
            assert 0.0 <= result.value <= 1.0
            assert result.reason is None

        print(" ✅ V2 class batch processing works correctly!")

    def test_answer_similarity_migration_requirements_documented(self):
        """Document the requirements for running full E2E answer similarity tests.

        Purely informational: prints the requirements and setup steps (visible
        with ``pytest -s``) and makes no assertions.
        """

        requirements = {
            "embeddings": "OpenAI embeddings, HuggingFace embeddings, or similar",
            "environment": "API keys configured for embedding providers",
            "purpose": "Verify that v2 class-based implementation produces identical results to legacy implementation",
        }

        print("\n📋 Answer Similarity E2E Test Requirements:")
        for key, value in requirements.items():
            print(f" {key.capitalize()}: {value}")

        print("\n🚀 To enable full E2E testing:")
        print(" 1. Configure embedding provider (e.g., export OPENAI_API_KEY=...)")
        # The tests in this class carry no skip decorators; skipping is done
        # by the embedding fixtures, so the real prerequisite is that the
        # fixtures can build their clients.
        print(" 2. Install the openai package used by the modern embeddings fixture")
        print(
            " 3. Run: pytest tests/e2e/metrics_migration/test_answer_similarity_migration.py -v -s"
        )
0 commit comments