Skip to content

Commit 39ac4d6

Browse files
committed
Add context recall migration tests
1 parent b499c54 commit 39ac4d6

File tree

1 file changed

+158
-0
lines changed

1 file changed

+158
-0
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
"""E2E tests for Context Recall metric migration from v1 (class-based) to v2 (class-based with automatic validation)."""
2+
3+
import pytest
4+
5+
from ragas.metrics import LLMContextRecall as LegacyContextRecall
6+
from ragas.metrics.collections import ContextRecall
7+
8+
from .base_migration_test import BaseMigrationTest
9+
10+
11+
class TestContextRecallE2EMigration(BaseMigrationTest):
    """E2E test compatibility between legacy ContextRecall class and new V2 ContextRecall class with automatic validation.

    Every test delegates the actual metric execution to helpers inherited from
    ``BaseMigrationTest`` (defined outside this module); this class only
    supplies the context-recall specific data, factories and assertions.
    """

    # Minimum score both implementations must exceed for a case flagged
    # ``expected_high`` in the attribution-detection test.
    HIGH_ATTRIBUTION_THRESHOLD = 0.8

    @staticmethod
    def _skip_if_llm_missing(legacy_llm, modern_llm):
        """Skip the calling test when either LLM fixture resolved to ``None``.

        Shared by all LLM-backed tests in this class so they behave the same
        way when no LLM is configured.
        """
        if legacy_llm is None or modern_llm is None:
            pytest.skip("LLM required for E2E testing")

    @pytest.fixture
    def sample_data(self):
        """Real-world test cases for context recall evaluation.

        Returns:
            A list of dicts, each with ``user_input``, ``retrieved_contexts``,
            ``reference`` and a human-readable ``description`` of the scenario.
        """
        return [
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": [
                    "Paris is the capital and largest city of France.",
                    "France is a country in Western Europe.",
                ],
                "reference": "Paris is the capital of France. It is located in northern France.",
                "description": "Full attribution - all statements should be found in context",
            },
            {
                "user_input": "Tell me about Albert Einstein",
                "retrieved_contexts": [
                    "Albert Einstein was born in 1879. He developed the theory of relativity."
                ],
                "reference": "Einstein was born in 1879. He won the Nobel Prize in 1921. He developed relativity theory.",
                "description": "Partial attribution - Nobel Prize not mentioned in context",
            },
            {
                "user_input": "What are the main causes of climate change?",
                "retrieved_contexts": [
                    "Climate change is primarily caused by greenhouse gas emissions from burning fossil fuels.",
                    "Deforestation also contributes to climate change by reducing CO2 absorption.",
                ],
                "reference": "The main causes include fossil fuel emissions and deforestation.",
                "description": "Multiple contexts - all statements attributed",
            },
            {
                "user_input": "How does photosynthesis work?",
                "retrieved_contexts": [
                    "Photosynthesis is a process where plants use sunlight to produce glucose."
                ],
                "reference": "Plants convert sunlight into glucose through photosynthesis. This process also produces oxygen and occurs in chloroplasts.",
                "description": "Partial attribution - oxygen and chloroplasts not in context",
            },
            {
                "user_input": "What is quantum computing?",
                "retrieved_contexts": [
                    "Quantum computers use quantum bits or qubits instead of classical bits."
                ],
                "reference": "Quantum computing uses qubits.",
                "description": "Simple case - direct attribution",
            },
        ]

    @pytest.mark.asyncio
    async def test_legacy_context_recall_vs_v2_context_recall_e2e_compatibility(
        self,
        sample_data,
        legacy_llm,
        modern_llm,
    ):
        """E2E test that legacy and v2 implementations produce similar scores with real LLM.

        Skips when either LLM fixture is ``None``, matching the guard used by
        the attribution-detection test. The comparison itself is performed by
        ``run_e2e_compatibility_test`` with ``tolerance=0.3``.
        """
        # Guard explicitly rather than relying on the base helper, so a missing
        # LLM results in a skip instead of a failure inside a metric factory.
        self._skip_if_llm_missing(legacy_llm, modern_llm)

        await self.run_e2e_compatibility_test(
            sample_data=sample_data,
            legacy_metric_factory=LegacyContextRecall,
            v2_metric_factory=ContextRecall,
            legacy_components={"llm": legacy_llm},
            v2_components={"llm": modern_llm},
            tolerance=0.3,
            metric_name="Context Recall",
            additional_info_keys=["user_input", "reference"],
        )

    @pytest.mark.asyncio
    async def test_context_recall_attribution_detection(self, legacy_llm, modern_llm):
        """Test that both implementations correctly detect statement attributions.

        Only cases flagged ``expected_high`` are asserted against
        ``HIGH_ATTRIBUTION_THRESHOLD``; the remaining cases are reported but
        deliberately not bound to a threshold.
        """
        self._skip_if_llm_missing(legacy_llm, modern_llm)

        # Test cases specifically for attribution detection
        test_cases = [
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": ["Paris is the capital of France."],
                "reference": "Paris is the capital of France.",
                "expected_high": True,
                "description": "Perfect attribution - should get high score",
            },
            {
                "user_input": "What is the capital of France?",
                "retrieved_contexts": ["France is a European country."],
                "reference": "Paris is the capital of France.",
                "expected_high": False,
                "description": "No attribution - should get low score",
            },
            {
                "user_input": "Tell me about Einstein",
                "retrieved_contexts": ["Einstein was born in 1879."],
                "reference": "Einstein was born in 1879. He won the Nobel Prize.",
                "expected_high": False,
                "description": "Partial attribution - should get medium score (50%)",
            },
        ]

        threshold = self.HIGH_ATTRIBUTION_THRESHOLD

        def assertion_fn(case, legacy_score, v2_result):
            """Check one case; ``v2_result`` is read through its ``.value`` attribute."""
            print(f"   Reference: {case['reference']}")

            if case.get("expected_high"):
                # High attribution should get high scores (> threshold)
                assert legacy_score > threshold, (
                    f"Legacy should detect high attribution: {legacy_score}"
                )
                assert v2_result.value > threshold, (
                    f"V2 class should detect high attribution: {v2_result.value}"
                )
                print("   ✅ All detected high attribution")
            else:
                # Low/partial attribution should get lower scores
                # Note: We don't enforce strict thresholds here as it depends on the specific case
                print(
                    f"   ✅ Scores reflect attribution level (Legacy: {legacy_score:.2f}, V2: {v2_result.value:.2f})"
                )

        await self.run_metric_specific_test(
            test_cases=test_cases,
            legacy_metric_factory=LegacyContextRecall,
            v2_metric_factory=ContextRecall,
            legacy_components={"llm": legacy_llm},
            v2_components={"llm": modern_llm},
            test_name="attribution detection",
            assertion_fn=assertion_fn,
        )

    def test_context_recall_migration_requirements_documented(self):
        """Document the requirements for running full E2E context recall tests.

        Passes as long as ``create_requirements_documentation`` does not raise;
        no LLM is needed.
        """
        requirements = {
            "llm": "OpenAI GPT, Anthropic Claude, or other LangChain-compatible LLM",
            "environment": "API keys configured for LLM providers",
            "purpose": "Verify that v2 class-based implementation with automatic validation produces similar results to legacy class-based implementation",
        }

        self.create_requirements_documentation(
            metric_name="Context Recall",
            requirements=requirements,
            test_file_name="test_context_recall_migration.py",
        )

0 commit comments

Comments
 (0)