|
| 1 | +"""E2E tests for Context Recall metric migration from v1 (class-based) to v2 (class-based with automatic validation).""" |
| 2 | + |
| 3 | +import pytest |
| 4 | + |
| 5 | +from ragas.metrics import LLMContextRecall as LegacyContextRecall |
| 6 | +from ragas.metrics.collections import ContextRecall |
| 7 | + |
| 8 | +from .base_migration_test import BaseMigrationTest |
| 9 | + |
| 10 | + |
class TestContextRecallE2EMigration(BaseMigrationTest):
    """E2E test compatibility between legacy ContextRecall class and new V2 ContextRecall class with automatic validation."""

    @pytest.fixture
    def sample_data(self):
        """Real-world test cases for context recall evaluation."""
        # Cases span full, partial, and multi-context attribution scenarios.
        cases = [
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": [
                    "Paris is the capital and largest city of France.",
                    "France is a country in Western Europe.",
                ],
                "reference": "Paris is the capital of France. It is located in northern France.",
                "description": "Full attribution - all statements should be found in context",
            },
            {
                "user_input": "Tell me about Albert Einstein",
                "retrieved_contexts": [
                    "Albert Einstein was born in 1879. He developed the theory of relativity."
                ],
                "reference": "Einstein was born in 1879. He won the Nobel Prize in 1921. He developed relativity theory.",
                "description": "Partial attribution - Nobel Prize not mentioned in context",
            },
            {
                "user_input": "What are the main causes of climate change?",
                "retrieved_contexts": [
                    "Climate change is primarily caused by greenhouse gas emissions from burning fossil fuels.",
                    "Deforestation also contributes to climate change by reducing CO2 absorption.",
                ],
                "reference": "The main causes include fossil fuel emissions and deforestation.",
                "description": "Multiple contexts - all statements attributed",
            },
            {
                "user_input": "How does photosynthesis work?",
                "retrieved_contexts": [
                    "Photosynthesis is a process where plants use sunlight to produce glucose."
                ],
                "reference": "Plants convert sunlight into glucose through photosynthesis. This process also produces oxygen and occurs in chloroplasts.",
                "description": "Partial attribution - oxygen and chloroplasts not in context",
            },
            {
                "user_input": "What is quantum computing?",
                "retrieved_contexts": [
                    "Quantum computers use quantum bits or qubits instead of classical bits."
                ],
                "reference": "Quantum computing uses qubits.",
                "description": "Simple case - direct attribution",
            },
        ]
        return cases

    @pytest.mark.asyncio
    async def test_legacy_context_recall_vs_v2_context_recall_e2e_compatibility(
        self,
        sample_data,
        legacy_llm,
        modern_llm,
    ):
        """E2E test that legacy and v2 implementations produce similar scores with real LLM."""
        # Delegate to the shared base-class harness; tolerance of 0.3 allows
        # for LLM non-determinism between the two implementations.
        await self.run_e2e_compatibility_test(
            sample_data=sample_data,
            legacy_metric_factory=LegacyContextRecall,
            v2_metric_factory=ContextRecall,
            legacy_components={"llm": legacy_llm},
            v2_components={"llm": modern_llm},
            tolerance=0.3,
            metric_name="Context Recall",
            additional_info_keys=["user_input", "reference"],
        )

    @pytest.mark.asyncio
    async def test_context_recall_attribution_detection(self, legacy_llm, modern_llm):
        """Test that both implementations correctly detect statement attributions."""

        if legacy_llm is None or modern_llm is None:
            pytest.skip("LLM required for E2E testing")

        # Test cases specifically for attribution detection
        test_cases = [
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": ["Paris is the capital of France."],
                "reference": "Paris is the capital of France.",
                "expected_high": True,
                "description": "Perfect attribution - should get high score",
            },
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": ["France is a European country."],
                "reference": "Paris is the capital of France.",
                "expected_high": False,
                "description": "No attribution - should get low score",
            },
            {
                "user_input": "Tell me about Einstein",
                "retrieved_contexts": ["Einstein was born in 1879."],
                "reference": "Einstein was born in 1879. He won the Nobel Prize.",
                "expected_high": False,
                "description": "Partial attribution - should get medium score (50%)",
            },
        ]

        # Per-case assertion callback passed into the shared harness.
        def check_case(case, legacy_score, v2_result):
            print(f" Reference: {case['reference']}")

            if not case.get("expected_high"):
                # Low/partial attribution should get lower scores
                # Note: We don't enforce strict thresholds here as it depends on the specific case
                print(
                    f" ✅ Scores reflect attribution level (Legacy: {legacy_score:.2f}, V2: {v2_result.value:.2f})"
                )
                return

            # High attribution should get high scores (> 0.8)
            assert legacy_score > 0.8, (
                f"Legacy should detect high attribution: {legacy_score}"
            )
            assert v2_result.value > 0.8, (
                f"V2 class should detect high attribution: {v2_result.value}"
            )
            print(" ✅ All detected high attribution")

        await self.run_metric_specific_test(
            test_cases=test_cases,
            legacy_metric_factory=LegacyContextRecall,
            v2_metric_factory=ContextRecall,
            legacy_components={"llm": legacy_llm},
            v2_components={"llm": modern_llm},
            test_name="attribution detection",
            assertion_fn=check_case,
        )

    def test_context_recall_migration_requirements_documented(self):
        """Document the requirements for running full E2E context recall tests."""

        # What an engineer needs before the real E2E tests above can run.
        requirements = {
            "llm": "OpenAI GPT, Anthropic Claude, or other LangChain-compatible LLM",
            "environment": "API keys configured for LLM providers",
            "purpose": "Verify that v2 class-based implementation with automatic validation produces similar results to legacy class-based implementation",
        }

        self.create_requirements_documentation(
            metric_name="Context Recall",
            requirements=requirements,
            test_file_name="test_context_recall_migration.py",
        )

        assert True