
Commit bc3b19f

refactor the tests
1 parent 75b36e0 commit bc3b19f

File tree

7 files changed: +1301 -56 lines

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
"""Base test class for metrics migration E2E tests."""

from typing import Any, Callable, Dict, List, Optional

import pytest

from .test_utils import (
    assert_score_types,
    compare_scores_with_tolerance,
    create_legacy_sample,
    print_score_comparison,
    print_test_header,
    print_test_success,
)


class BaseMigrationTest:
    """Base class for metrics migration E2E tests.

    Provides common functionality for testing compatibility between legacy and v2 implementations.
    Subclasses should implement metric-specific test data and configurations.
    """

    @pytest.mark.asyncio
    async def run_e2e_compatibility_test(
        self,
        sample_data: List[Dict[str, Any]],
        legacy_metric_factory: Callable,
        v2_metric_factory: Callable,
        v2_score_method_name: str = "ascore",
        legacy_components: Optional[Dict[str, Any]] = None,
        v2_components: Optional[Dict[str, Any]] = None,
        tolerance: float = 0.3,
        metric_name: str = "Metric",
        additional_info_keys: Optional[List[str]] = None,
    ) -> None:
        """Run E2E compatibility test between legacy and v2 implementations.

        Args:
            sample_data: List of test cases, each as a dictionary
            legacy_metric_factory: Function to create legacy metric instance
            v2_metric_factory: Function to create v2 metric instance
            v2_score_method_name: Name of the scoring method on the v2 metric
            legacy_components: Components for legacy metric (llm, embeddings, etc.)
            v2_components: Components for v2 metric (llm, embeddings, etc.)
            tolerance: Maximum allowed score difference
            metric_name: Name of the metric for display
            additional_info_keys: Keys from data dict to display in test output
        """
        # Skip if any required component is unavailable
        if legacy_components:
            if any(component is None for component in legacy_components.values()):
                pytest.skip("Required components not available for E2E testing")

        if v2_components:
            if any(component is None for component in v2_components.values()):
                pytest.skip("Required components not available for E2E testing")

        # Create metric instances
        legacy_metric = (
            legacy_metric_factory(**legacy_components)
            if legacy_components
            else legacy_metric_factory()
        )
        v2_metric = (
            v2_metric_factory(**v2_components) if v2_components else v2_metric_factory()
        )

        # Run tests for each sample
        for i, data in enumerate(sample_data):
            description = data.get("description", "No description")

            # Prepare additional info for display
            additional_info = {}
            if additional_info_keys:
                for key in additional_info_keys:
                    if key in data:
                        additional_info[key.replace("_", " ").title()] = str(data[key])

            print_test_header(metric_name, i + 1, description, additional_info)

            # Score with legacy implementation
            legacy_sample = create_legacy_sample(data)
            legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None)

            # Score with v2 implementation
            # Extract parameters for v2 scoring (exclude metadata keys)
            v2_params = {k: v for k, v in data.items() if k != "description"}
            v2_score_method = getattr(v2_metric, v2_score_method_name)
            v2_result = await v2_score_method(**v2_params)

            # Compare scores
            print_score_comparison(legacy_score, v2_result.value)

            # Assert scores are within tolerance
            compare_scores_with_tolerance(
                legacy_score,
                v2_result.value,
                tolerance,
                description,
                i + 1,
            )

            # Assert types and ranges
            assert_score_types(legacy_score, v2_result)

            print_test_success()

    @pytest.mark.asyncio
    async def run_metric_specific_test(
        self,
        test_cases: List[Dict[str, Any]],
        legacy_metric_factory: Callable,
        v2_metric_factory: Callable,
        legacy_components: Optional[Dict[str, Any]] = None,
        v2_components: Optional[Dict[str, Any]] = None,
        test_name: str = "Metric Specific Test",
        assertion_fn: Optional[Callable] = None,
    ) -> None:
        """Run a metric-specific test with custom assertions.

        Args:
            test_cases: List of test cases
            legacy_metric_factory: Function to create legacy metric instance
            v2_metric_factory: Function to create v2 metric instance
            legacy_components: Components for legacy metric
            v2_components: Components for v2 metric
            test_name: Name of the test for display
            assertion_fn: Optional custom assertion function that takes (case, legacy_score, v2_result)
        """
        # Skip if any required component is unavailable
        if legacy_components:
            if any(component is None for component in legacy_components.values()):
                pytest.skip("Required components not available for testing")

        if v2_components:
            if any(component is None for component in v2_components.values()):
                pytest.skip("Required components not available for testing")

        # Create metric instances
        legacy_metric = (
            legacy_metric_factory(**legacy_components)
            if legacy_components
            else legacy_metric_factory()
        )
        v2_metric = (
            v2_metric_factory(**v2_components) if v2_components else v2_metric_factory()
        )

        # Run tests for each case
        for case in test_cases:
            description = case.get("description", "No description")
            print(f"\n🎯 Testing {test_name}: {description}")

            # Score with legacy implementation
            legacy_sample = create_legacy_sample(case)
            legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None)

            # Score with v2 implementation (exclude metadata and expectation keys)
            v2_params = {
                k: v
                for k, v in case.items()
                if k not in ["description", "expected_high", "expected_low"]
            }
            v2_result = await v2_metric.ascore(**v2_params)

            # Print scores
            print_score_comparison(legacy_score, v2_result.value)

            # Run custom assertions if provided
            if assertion_fn:
                assertion_fn(case, legacy_score, v2_result)
            else:
                # Default: just verify types
                assert_score_types(legacy_score, v2_result)

    def create_requirements_documentation(
        self,
        metric_name: str,
        requirements: Dict[str, str],
        test_file_name: str,
    ) -> None:
        """Print documentation about E2E test requirements.

        Args:
            metric_name: Name of the metric
            requirements: Dictionary of requirements
            test_file_name: Name of the test file
        """
        print(f"\n📋 {metric_name} E2E Test Requirements:")
        for key, value in requirements.items():
            print(f"  {key.capitalize()}: {value}")

        print("\n🚀 To enable full E2E testing:")
        print("  1. Configure required providers (e.g., export OPENAI_API_KEY=...)")
        print("  2. Remove @pytest.mark.skip decorators")
        print(f"  3. Run: pytest tests/e2e/metrics_migration/{test_file_name} -v -s")
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
"""Common fixtures for metrics migration E2E tests.

This module provides pytest fixtures that wrap the shared utility functions
from tests.utils.llm_setup for use in E2E migration tests.
"""

import pytest

from tests.utils import (
    create_legacy_embeddings,
    create_legacy_llm,
    create_modern_embeddings,
    create_modern_llm,
)


@pytest.fixture
def legacy_llm():
    """Create a test LLM for legacy metric evaluation.

    Uses the legacy llm_factory for the legacy implementation.
    Skips if the LLM factory is not available or the API key is missing.
    """
    try:
        return create_legacy_llm("gpt-3.5-turbo")
    except Exception as e:
        pytest.skip(str(e))


@pytest.fixture
def modern_llm():
    """Create a modern instructor LLM for the v2 implementation.

    Uses instructor_llm_factory with an OpenAI client.
    Skips if the instructor LLM factory is not available or the API key is missing.
    """
    try:
        return create_modern_llm("openai", model="gpt-3.5-turbo")
    except Exception as e:
        pytest.skip(str(e))


@pytest.fixture
def legacy_embeddings():
    """Create legacy embeddings for the legacy implementation.

    Uses the legacy embedding_factory interface.
    Skips if the embedding factory is not available or the API key is missing.
    """
    try:
        return create_legacy_embeddings("text-embedding-ada-002")
    except Exception as e:
        pytest.skip(str(e))


@pytest.fixture
def modern_embeddings():
    """Create modern embeddings for the v2 implementation.

    Uses the modern interface with an explicit provider and client.
    Skips if OpenAI or the embedding factory is not available or the API key is missing.
    """
    try:
        return create_modern_embeddings(
            provider="openai",
            model="text-embedding-ada-002",
        )
    except Exception as e:
        pytest.skip(str(e))
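Because each fixture converts construction failures into pytest.skip, any test that requests one degrades gracefully on machines without the factory or an API key. A minimal sketch of the wiring (the test itself is illustrative, not part of this commit):

import pytest


@pytest.mark.asyncio
async def test_component_wiring(legacy_llm, modern_llm):
    """Skipped during fixture setup if either client cannot be built."""
    # These dicts mirror what BaseMigrationTest expects: it additionally
    # skips the test if any component value is None.
    legacy_components = {"llm": legacy_llm}
    v2_components = {"llm": modern_llm}
    assert all(c is not None for c in legacy_components.values())
    assert all(c is not None for c in v2_components.values())

Skipping rather than failing keeps the migration suite green in CI environments that do not set OPENAI_API_KEY, while still exercising the full legacy-vs-v2 comparison wherever credentials are present.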

0 commit comments