diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py index 1c6061916..04cfdd22c 100644 --- a/src/ragas/metrics/collections/__init__.py +++ b/src/ragas/metrics/collections/__init__.py @@ -1,5 +1,6 @@ """Collections of metrics using modern component architecture.""" +from ragas.metrics.collections._answer_accuracy import AnswerAccuracy from ragas.metrics.collections._answer_correctness import AnswerCorrectness from ragas.metrics.collections._answer_relevancy import AnswerRelevancy from ragas.metrics.collections._answer_similarity import AnswerSimilarity @@ -13,6 +14,7 @@ ) from ragas.metrics.collections._bleu_score import BleuScore from ragas.metrics.collections._context_entity_recall import ContextEntityRecall +from ragas.metrics.collections._context_relevance import ContextRelevance from ragas.metrics.collections._faithfulness import Faithfulness from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity from ragas.metrics.collections._rouge_score import RougeScore @@ -29,12 +31,14 @@ __all__ = [ "BaseMetric", # Base class + "AnswerAccuracy", "AnswerCorrectness", "AnswerRelevancy", "AnswerSimilarity", "AspectCritic", "BleuScore", "ContextEntityRecall", + "ContextRelevance", "DistanceMeasure", "ExactMatch", "Faithfulness", diff --git a/src/ragas/metrics/collections/_answer_accuracy.py b/src/ragas/metrics/collections/_answer_accuracy.py new file mode 100644 index 000000000..85050b4b1 --- /dev/null +++ b/src/ragas/metrics/collections/_answer_accuracy.py @@ -0,0 +1,171 @@ +"""Answer Accuracy metric v2 - Modern implementation with dual-judge evaluation.""" + +import typing as t + +import numpy as np +from pydantic import BaseModel + +from ragas.metrics.collections.base import BaseMetric +from ragas.metrics.result import MetricResult +from ragas.prompt.metrics.answer_accuracy import ( + answer_accuracy_judge1_prompt, + answer_accuracy_judge2_prompt, +) + +if t.TYPE_CHECKING: + from ragas.llms.base import 
InstructorBaseRagasLLM + + +class JudgeRating(BaseModel): + """Structured output for judge rating.""" + + rating: int + + +class AnswerAccuracy(BaseMetric): + """ + Modern v2 implementation of answer accuracy evaluation. + + Measures answer accuracy compared to ground truth using a dual-judge system. + This metric averages two distinct judge prompts to ensure robust evaluation. + + The metric uses NVIDIA's proven dual-judge approach: + 1. Judge 1: Direct User Answer vs Reference Answer comparison + 2. Judge 2: Swapped perspective for fairness + 3. Average both judges for final score + + Rating scale: 0 (no match), 2 (partial match), 4 (exact match) + Final score: Average of both judges converted to 0.0-1.0 scale + + Usage: + >>> import instructor + >>> from openai import AsyncOpenAI + >>> from ragas.llms.base import instructor_llm_factory + >>> from ragas.metrics.collections import AnswerAccuracy + >>> + >>> # Setup dependencies + >>> client = AsyncOpenAI() + >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o") + >>> + >>> # Create metric instance + >>> metric = AnswerAccuracy(llm=llm) + >>> + >>> # Single evaluation + >>> result = await metric.ascore( + ... user_input="When was Einstein born?", + ... response="Albert Einstein was born in 1879.", + ... reference="Albert Einstein was born in 1879." + ... ) + >>> print(f"Answer Accuracy: {result.value}") + + Attributes: + llm: Modern instructor-based LLM for dual-judge evaluation + name: The metric name + allowed_values: Score range (0.0 to 1.0, higher is better) + max_retries: Maximum retry attempts for invalid ratings + """ + + # Type hints for linter (attributes are set in __init__) + llm: "InstructorBaseRagasLLM" + + def __init__( + self, + llm: "InstructorBaseRagasLLM", + name: str = "answer_accuracy", + max_retries: int = 5, + **kwargs, + ): + """ + Initialize AnswerAccuracy metric with required components. 
+ + Args: + llm: Modern instructor-based LLM for dual-judge evaluation + name: The metric name + max_retries: Maximum retry attempts for invalid ratings + """ + # Set attributes explicitly before calling super() + self.llm = llm + self.max_retries = max_retries + + # Call super() for validation (without passing llm in kwargs) + super().__init__(name=name, **kwargs) + + async def ascore( + self, user_input: str, response: str, reference: str + ) -> MetricResult: + """ + Calculate answer accuracy score using dual-judge evaluation. + + Args: + user_input: The original question + response: The user's answer to evaluate + reference: The ground truth reference answer + + Returns: + MetricResult with answer accuracy score (0.0-1.0, higher is better) + """ + # Input validation + if not user_input: + raise ValueError( + "user_input is missing. Please add user_input to the test sample." + ) + if not response: + raise ValueError( + "response is missing. Please add response to the test sample." + ) + if not reference: + raise ValueError( + "reference is missing. Please add reference to the test sample." 
+ ) + + # Get ratings from both judges (no temperature is set here; configure it on the LLM) + judge1_rating = await self._get_judge_rating( + answer_accuracy_judge1_prompt(user_input, response, reference) + ) + judge2_rating = await self._get_judge_rating( + answer_accuracy_judge2_prompt( + user_input, response, reference + ) # judge-2 template itself lists the reference first + ) + + # Average the scores (convert from 0,2,4 scale to 0.0-1.0) + score = self._average_scores(judge1_rating / 4.0, judge2_rating / 4.0) + + return MetricResult(value=float(score)) + + async def _get_judge_rating(self, prompt: str) -> float: + """Get rating from judge using structured JSON output.""" + for retry in range(self.max_retries): + try: + # Use structured output with JSON - clean and reliable + result = await self.llm.agenerate(prompt, JudgeRating) + rating = result.rating + + # Validate rating is in expected range + if rating in [0, 2, 4]: + return float(rating) + else: + # Invalid rating - retry or return NaN + if retry < self.max_retries - 1: + continue # Retry if invalid rating + else: + return float("nan") + + except Exception: + if retry < self.max_retries - 1: + continue # Retry on exception + else: + return float("nan") + + return float("nan") + + def _average_scores(self, score1: float, score2: float) -> float: + """Average two judge scores, handling NaN values.""" + if not np.isnan(score1) and not np.isnan(score2): + return (score1 + score2) / 2.0 + elif not np.isnan(score1): + return score1 + elif not np.isnan(score2): + return score2 + else: + return float("nan") diff --git a/src/ragas/metrics/collections/_context_relevance.py b/src/ragas/metrics/collections/_context_relevance.py new file mode 100644 index 000000000..567d53262 --- /dev/null +++ b/src/ragas/metrics/collections/_context_relevance.py @@ -0,0 +1,177 @@ +"""Context Relevance metric v2 - Modern implementation with dual-judge evaluation.""" + +import typing as t +from typing import List + +import numpy as np +from pydantic import BaseModel + +from 
ragas.metrics.collections.base import BaseMetric +from ragas.metrics.result import MetricResult +from ragas.prompt.metrics.context_relevance import ( + context_relevance_judge1_prompt, + context_relevance_judge2_prompt, +) + +if t.TYPE_CHECKING: + from ragas.llms.base import InstructorBaseRagasLLM + + +class RelevanceRating(BaseModel): + """Structured output for relevance rating.""" + + rating: int + + +class ContextRelevance(BaseMetric): + """ + Modern v2 implementation of context relevance evaluation. + + Evaluates whether the retrieved contexts are pertinent to the user input + using a dual-judge system. This metric averages two distinct judge prompts + to ensure robust evaluation. + + The metric uses NVIDIA's proven dual-judge approach: + 1. Judge 1: Direct context relevance evaluation + 2. Judge 2: Alternative perspective for fairness + 3. Average both judges for final score + + Rating scale: 0 (not relevant), 1 (partially relevant), 2 (fully relevant) + Final score: Average of both judges converted to 0.0-1.0 scale + + Usage: + >>> import instructor + >>> from openai import AsyncOpenAI + >>> from ragas.llms.base import instructor_llm_factory + >>> from ragas.metrics.collections import ContextRelevance + >>> + >>> # Setup dependencies + >>> client = AsyncOpenAI() + >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o") + >>> + >>> # Create metric instance + >>> metric = ContextRelevance(llm=llm) + >>> + >>> # Single evaluation + >>> result = await metric.ascore( + ... user_input="When was Einstein born?", + ... retrieved_contexts=["Albert Einstein was born March 14, 1879."] + ... 
) + >>> print(f"Context Relevance: {result.value}") + + Attributes: + llm: Modern instructor-based LLM for dual-judge evaluation + name: The metric name + allowed_values: Score range (0.0 to 1.0, higher is better) + max_retries: Maximum retry attempts for invalid ratings + """ + + # Type hints for linter (attributes are set in __init__) + llm: "InstructorBaseRagasLLM" + + def __init__( + self, + llm: "InstructorBaseRagasLLM", + name: str = "context_relevance", + max_retries: int = 5, + **kwargs, + ): + """ + Initialize ContextRelevance metric with required components. + + Args: + llm: Modern instructor-based LLM for dual-judge evaluation + name: The metric name + max_retries: Maximum retry attempts for invalid ratings + """ + # Set attributes explicitly before calling super() + self.llm = llm + self.max_retries = max_retries + + # Call super() for validation (without passing llm in kwargs) + super().__init__(name=name, **kwargs) + + async def ascore( + self, user_input: str, retrieved_contexts: List[str] + ) -> MetricResult: + """ + Calculate context relevance score using dual-judge evaluation. + + Args: + user_input: The original question + retrieved_contexts: The retrieved contexts to evaluate for relevance + + Returns: + MetricResult with context relevance score (0.0-1.0, higher is better) + """ + # Input validation + if not user_input: + raise ValueError( + "user_input is missing. Please add user_input to the test sample." + ) + if not retrieved_contexts: + raise ValueError( + "retrieved_contexts is missing. Please add retrieved_contexts to the test sample." 
+ ) + + # Handle edge cases like legacy + context_str = "\n".join(retrieved_contexts) + + if not user_input.strip() or not context_str.strip(): + return MetricResult(value=0.0) + + # Edge case: if user input matches context exactly + if user_input.strip() == context_str.strip(): + return MetricResult(value=0.0) + + # Edge case: if context is contained in user input + if context_str.strip() in user_input.strip(): + return MetricResult(value=0.0) + + # Get ratings from both judges (no temperature is set here; configure it on the LLM) + judge1_rating = await self._get_judge_rating( + context_relevance_judge1_prompt(user_input, context_str) + ) + judge2_rating = await self._get_judge_rating( + context_relevance_judge2_prompt(user_input, context_str) + ) + + # Average the scores (convert from 0,1,2 scale to 0.0-1.0) + score = self._average_scores(judge1_rating / 2.0, judge2_rating / 2.0) + + return MetricResult(value=float(score)) + + async def _get_judge_rating(self, prompt: str) -> float: + """Get a judge rating, retrying on failure; NaN if every attempt fails.""" + for retry in range(self.max_retries): + try: + result = await self.llm.agenerate(prompt, RelevanceRating) + rating = result.rating + + # Validate rating is in expected range + if rating in [0, 1, 2]: + return float(rating) + else: + if retry < self.max_retries - 1: + continue # Retry if invalid rating + else: + return float("nan") + + except Exception: + if retry < self.max_retries - 1: + continue # Retry on exception + else: + return float("nan") + + return float("nan") + + def _average_scores(self, score1: float, score2: float) -> float: + """Average two judge scores, handling NaN values.""" + if not np.isnan(score1) and not np.isnan(score2): + return (score1 + score2) / 2.0 + elif not np.isnan(score1): + return score1 + elif not np.isnan(score2): + return score2 + else: + return float("nan") diff --git a/src/ragas/prompt/metrics/answer_accuracy.py b/src/ragas/prompt/metrics/answer_accuracy.py new file mode 100644 index 
000000000..006627e1a --- /dev/null +++ b/src/ragas/prompt/metrics/answer_accuracy.py @@ -0,0 +1,73 @@ +"""Answer Accuracy prompts - Convert NVIDIA dual-judge templates to function format.""" + +import json + + +def answer_accuracy_judge1_prompt( + query: str, user_answer: str, reference_answer: str +) -> str: + """ + First judge template for answer accuracy evaluation. + + Uses JSON structured output for reliable parsing. + + Args: + query: The original question + user_answer: The response to evaluate + reference_answer: The ground truth reference + + Returns: + Prompt string for structured JSON rating (0, 2, or 4) + """ + safe_query = json.dumps(query) + safe_user_answer = json.dumps(user_answer) + safe_reference_answer = json.dumps(reference_answer) + + return f"""Instruction: You are a world class state of the art assistant for rating a User Answer given a Question. The Question is completely answered by the Reference Answer. +Say 4, if User Answer is full contained and equivalent to Reference Answer in all terms, topics, numbers, metrics, dates and units. +Say 2, if User Answer is partially contained and almost equivalent to Reference Answer in all terms, topics, numbers, metrics, dates and units. +Say 0, if User Answer is not contained in Reference Answer or not accurate in all terms, topics, numbers, metrics, dates and units or the User Answer do not answer the question. +Do not explain or justify your rating. Your rating must be only 4, 2 or 0 according to the instructions above. +Return your response as JSON in this format: {{"rating": X}} where X is 0, 2, or 4. + +### Question: {safe_query} +### User Answer: {safe_user_answer} +### Reference Answer: {safe_reference_answer} +The rating is:""" + + +def answer_accuracy_judge2_prompt( + query: str, user_answer: str, reference_answer: str +) -> str: + """ + Second judge template for answer accuracy evaluation. + + Uses JSON structured output for reliable parsing. 
+ + Args: + query: The original question + user_answer: The response to evaluate + reference_answer: The ground truth reference + + Returns: + Prompt string for structured JSON rating (0, 2, or 4) + """ + safe_query = json.dumps(query) + safe_user_answer = json.dumps(user_answer) + safe_reference_answer = json.dumps(reference_answer) + + return f"""I will rate the User Answer in comparison to the Reference Answer for a given Question. +A rating of 4 indicates that the User Answer is entirely consistent with the Reference Answer, covering all aspects, topics, numbers, metrics, dates, and units. +A rating of 2 signifies that the User Answer is mostly aligned with the Reference Answer, with minor discrepancies in some areas. +A rating of 0 means that the User Answer is either inaccurate, incomplete, or unrelated to the Reference Answer, or it fails to address the Question. +I will provide the rating without any explanation or justification, adhering to the following scale: 0 (no match), 2 (partial match), 4 (exact match). +Do not explain or justify my rating. My rating must be only 4, 2 or 0 only. +Return your response as JSON in this format: {{"rating": X}} where X is 0, 2, or 4. + +Question: {safe_query} + +Reference Answer: {safe_reference_answer} + +User Answer: {safe_user_answer} + +Rating: """ diff --git a/src/ragas/prompt/metrics/context_relevance.py b/src/ragas/prompt/metrics/context_relevance.py new file mode 100644 index 000000000..4431dee8f --- /dev/null +++ b/src/ragas/prompt/metrics/context_relevance.py @@ -0,0 +1,69 @@ +"""Context Relevance prompts - Convert NVIDIA dual-judge templates to function format.""" + +import json + + +def context_relevance_judge1_prompt(query: str, context: str) -> str: + """ + First judge template for context relevance evaluation. 
+ + Args: + query: The user's question + context: The retrieved context to evaluate + + Returns: + Prompt string for rating (0, 1, or 2) + """ + safe_query = json.dumps(query) + safe_context = json.dumps(context) + + return f"""### Instructions + +You are a world class expert designed to evaluate the relevance score of a Context in order to answer the Question. +Your task is to determine if the Context contains proper information to answer the Question. +Do not rely on your previous knowledge about the Question. +Use only what is written in the Context and in the Question. +Follow the instructions below: +0. If the context does not contains any relevant information to answer the question, say 0. +1. If the context partially contains relevant information to answer the question, say 1. +2. If the context contains any relevant information to answer the question, say 2. +You must provide the relevance score of 0, 1, or 2, nothing else. +Do not explain. +Return your response as JSON in this format: {{"rating": X}} where X is 0, 1, or 2. + +### Question: {safe_query} + +### Context: {safe_context} + +Do not try to explain. +Analyzing Context and Question, the Relevance score is """ + + +def context_relevance_judge2_prompt(query: str, context: str) -> str: + """ + Second judge template for context relevance evaluation. + + Args: + query: The user's question + context: The retrieved context to evaluate + + Returns: + Prompt string for rating (0, 1, or 2) + """ + safe_query = json.dumps(query) + safe_context = json.dumps(context) + + return f"""As a specially designed expert to assess the relevance score of a given Context in relation to a Question, my task is to determine the extent to which the Context provides information necessary to answer the Question. I will rely solely on the information provided in the Context and Question, and not on any prior knowledge. 
+ +Here are the instructions I will follow: +* If the Context does not contain any relevant information to answer the Question, I will respond with a relevance score of 0. +* If the Context partially contains relevant information to answer the Question, I will respond with a relevance score of 1. +* If the Context contains any relevant information to answer the Question, I will respond with a relevance score of 2. +Return your response as JSON in this format: {{"rating": X}} where X is 0, 1, or 2. + +### Question: {safe_query} + +### Context: {safe_context} + +Do not try to explain. +Based on the provided Question and Context, the Relevance score is [""" diff --git a/tests/e2e/metrics_migration/test_answer_accuracy_migration.py b/tests/e2e/metrics_migration/test_answer_accuracy_migration.py new file mode 100644 index 000000000..480a3ee2f --- /dev/null +++ b/tests/e2e/metrics_migration/test_answer_accuracy_migration.py @@ -0,0 +1,201 @@ +"""E2E tests for Answer Accuracy metric migration from v1 to v2.""" + +import numpy as np +import pytest + +from ragas.dataset_schema import SingleTurnSample +from ragas.metrics._nv_metrics import AnswerAccuracy as LegacyAnswerAccuracy +from ragas.metrics.collections import AnswerAccuracy + + +# NVIDIA-specific fixtures with correct temperature (0.1) +@pytest.fixture +def nvidia_legacy_llm(): + """Create legacy LLM for AnswerAccuracy (temperature set in metric calls).""" + try: + from langchain_openai import ChatOpenAI + + from ragas.llms.base import LangchainLLMWrapper + + # Legacy sets temperature=0.1 in the metric calls, so this 0.01 should not matter + langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01) + return LangchainLLMWrapper(langchain_llm) + except Exception as e: + pytest.skip(str(e)) + + +@pytest.fixture +def nvidia_modern_llm(): + """Create modern LLM with NVIDIA temperature (0.1) for AnswerAccuracy.""" + try: + import openai + + from ragas.llms.base import instructor_llm_factory + + client = openai.AsyncOpenAI() + # Set 
temperature=0.1 to match legacy NVIDIA calls exactly + return instructor_llm_factory( + "openai", model="gpt-4o", client=client, temperature=0.1 + ) + except Exception as e: + pytest.skip(str(e)) + + +class TestAnswerAccuracyE2EMigration: + """E2E test compatibility between legacy AnswerAccuracy and new V2 AnswerAccuracy with modern components.""" + + @pytest.fixture + def sample_data(self): + """Real-world test cases for answer accuracy evaluation.""" + return [ + { + "user_input": "When was Einstein born?", + "response": "Albert Einstein was born in 1879.", + "reference": "Albert Einstein was born in 1879.", + "description": "Exact match - should score high", + }, + { + "user_input": "When was Einstein born?", + "response": "Albert Einstein was born on March 14, 1879.", + "reference": "Albert Einstein was born in 1879.", + "description": "Partial match - additional correct details", + }, + { + "user_input": "When was Einstein born?", + "response": "Albert Einstein was born in 1885.", + "reference": "Albert Einstein was born in 1879.", + "description": "Incorrect answer - wrong year", + }, + { + "user_input": "What is photosynthesis?", + "response": "Photosynthesis is how plants make energy.", + "reference": "Photosynthesis is the process by which plants convert sunlight into chemical energy using chlorophyll.", + "description": "Incomplete but correct summary", + }, + ] + + @pytest.fixture + def test_llm(self): + """Create a test LLM for legacy answer accuracy evaluation.""" + try: + from ragas.llms.base import llm_factory + + return llm_factory("gpt-4o") + except ImportError as e: + pytest.skip(f"LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create LLM (API key may be missing): {e}") + + @pytest.fixture + def test_modern_llm(self): + """Create a modern instructor LLM for v2 implementation.""" + try: + import openai + + from ragas.llms.base import llm_factory + + client = openai.AsyncOpenAI() + return llm_factory( + 
model="gpt-4o", + provider="openai", + client=client, + ) + except ImportError as e: + pytest.skip(f"Instructor LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create modern LLM (API key may be missing): {e}") + + @pytest.mark.asyncio + async def test_legacy_answer_accuracy_vs_v2_answer_accuracy_e2e_compatibility( + self, sample_data, nvidia_legacy_llm, nvidia_modern_llm + ): + """E2E test that legacy and v2 implementations produce similar scores.""" + + if nvidia_legacy_llm is None or nvidia_modern_llm is None: + pytest.skip("LLM required for E2E testing") + + for i, data in enumerate(sample_data): + print(f"\n🧪 Testing Answer Accuracy - Case {i + 1}: {data['description']}") + print(f" Question: {data['user_input']}") + print(f" Response: {data['response']}") + print(f" Reference: {data['reference']}") + + # Legacy implementation + legacy_answer_accuracy = LegacyAnswerAccuracy(llm=nvidia_legacy_llm) + legacy_sample = SingleTurnSample( + user_input=data["user_input"], + response=data["response"], + reference=data["reference"], + ) + legacy_score = await legacy_answer_accuracy._single_turn_ascore( + legacy_sample, None + ) + + # V2 implementation + v2_answer_accuracy = AnswerAccuracy(llm=nvidia_modern_llm) + v2_result = await v2_answer_accuracy.ascore( + user_input=data["user_input"], + response=data["response"], + reference=data["reference"], + ) + + score_diff = ( + abs(legacy_score - v2_result.value) + if not np.isnan(legacy_score) and not np.isnan(v2_result.value) + else 0.0 + ) + print(f" Legacy: {legacy_score:.6f}") + print(f" V2: {v2_result.value:.6f}") + print(f" Diff: {score_diff:.6f}") + + # Both implementations use dual judges with same prompts and temperature + # Some variance expected due to Langchain vs Instructor interface differences + if not np.isnan(legacy_score) and not np.isnan(v2_result.value): + assert score_diff < 0.6, ( + f"Legacy and V2 scores should be reasonably similar: Legacy={legacy_score:.6f}, " + 
f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.6)" + ) + print(" ✅ Both implementations give consistent scores") + else: + print(" ℹ️ One or both scores are NaN - edge case handling") + + # Validate score ranges (should be 0-1 or NaN) + if not np.isnan(legacy_score): + assert 0.0 <= legacy_score <= 1.0 + if not np.isnan(v2_result.value): + assert 0.0 <= v2_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_answer_accuracy_dual_judge_system(self, test_modern_llm): + """Test that v2 implementation correctly uses dual-judge system.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for dual-judge testing") + + metric = AnswerAccuracy(llm=test_modern_llm) + + # Test case where both judges should agree + result = await metric.ascore( + user_input="What is 2+2?", + response="2+2 equals 4.", + reference="2+2 equals 4.", + ) + + print(f"Dual-judge result: {result.value:.3f}") + + # Should be high score for exact match + if not np.isnan(result.value): + assert 0.5 <= result.value <= 1.0, ( + f"Expected high score for exact match, got {result.value}" + ) + + def test_answer_accuracy_migration_requirements_documented(self): + """Test that migration requirements are properly documented.""" + + # V2 implementation should not accept legacy components + with pytest.raises((TypeError, ValueError, AttributeError)): + AnswerAccuracy(llm="invalid_llm_type") # Should reject string + + # V2 should only accept InstructorBaseRagasLLM + with pytest.raises((TypeError, ValueError, AttributeError)): + AnswerAccuracy(llm=None) # Should reject None diff --git a/tests/e2e/metrics_migration/test_context_relevance_migration.py b/tests/e2e/metrics_migration/test_context_relevance_migration.py new file mode 100644 index 000000000..8f33a960a --- /dev/null +++ b/tests/e2e/metrics_migration/test_context_relevance_migration.py @@ -0,0 +1,230 @@ +"""E2E tests for Context Relevance metric migration from v1 to v2.""" + +import numpy as np +import pytest + 
+from ragas.dataset_schema import SingleTurnSample +from ragas.metrics._nv_metrics import ContextRelevance as LegacyContextRelevance +from ragas.metrics.collections import ContextRelevance + + +# NVIDIA-specific fixtures with correct temperature (0.1) +@pytest.fixture +def nvidia_legacy_llm(): + """Create legacy LLM for ContextRelevance (temperature set in metric calls).""" + try: + from langchain_openai import ChatOpenAI + + from ragas.llms.base import LangchainLLMWrapper + + # Legacy sets temperature=0.1 in the metric calls, so this 0.01 should not matter + langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01) + return LangchainLLMWrapper(langchain_llm) + except Exception as e: + pytest.skip(str(e)) + + +@pytest.fixture +def nvidia_modern_llm(): + """Create modern LLM with NVIDIA temperature (0.1) for ContextRelevance.""" + try: + import openai + + from ragas.llms.base import llm_factory + + client = openai.AsyncOpenAI() + # Set temperature=0.1 to match legacy NVIDIA calls exactly + return llm_factory( + model="gpt-4o", provider="openai", client=client, temperature=0.1 + ) + except Exception as e: + pytest.skip(str(e)) + + +class TestContextRelevanceE2EMigration: + """E2E test compatibility between legacy ContextRelevance and new V2 ContextRelevance with modern components.""" + + @pytest.fixture + def sample_data(self): + """Real-world test cases for context relevance evaluation.""" + return [ + { + "user_input": "When and where was Albert Einstein born?", + "retrieved_contexts": [ + "Albert Einstein was born March 14, 1879.", + "Albert Einstein was born at Ulm, in Württemberg, Germany.", + ], + "description": "Fully relevant contexts - should score high", + }, + { + "user_input": "What is photosynthesis?", + "retrieved_contexts": [ + "Photosynthesis is the process by which plants convert sunlight into energy.", + "Albert Einstein developed the theory of relativity.", + ], + "description": "Partially relevant contexts - mixed relevance", + }, + { + "user_input": "How 
do computers work?", + "retrieved_contexts": [ + "Albert Einstein was a theoretical physicist.", + "The weather today is sunny and warm.", + ], + "description": "Irrelevant contexts - should score low", + }, + { + "user_input": "What is machine learning?", + "retrieved_contexts": [ + "Machine learning is a subset of artificial intelligence that enables computers to learn and improve automatically.", + ], + "description": "Single highly relevant context", + }, + ] + + @pytest.fixture + def test_llm(self): + """Create a test LLM for legacy context relevance evaluation.""" + try: + from ragas.llms.base import llm_factory + + return llm_factory("gpt-4o") + except ImportError as e: + pytest.skip(f"LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create LLM (API key may be missing): {e}") + + @pytest.fixture + def test_modern_llm(self): + """Create a modern instructor LLM for v2 implementation.""" + try: + import openai + + from ragas.llms.base import llm_factory + + client = openai.AsyncOpenAI() + return llm_factory( + model="gpt-4o", + provider="openai", + client=client, + ) + except ImportError as e: + pytest.skip(f"Instructor LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create modern LLM (API key may be missing): {e}") + + @pytest.mark.asyncio + async def test_legacy_context_relevance_vs_v2_context_relevance_e2e_compatibility( + self, sample_data, nvidia_legacy_llm, nvidia_modern_llm + ): + """E2E test that legacy and v2 implementations produce similar scores.""" + + if nvidia_legacy_llm is None or nvidia_modern_llm is None: + pytest.skip("LLM required for E2E testing") + + for i, data in enumerate(sample_data): + print( + f"\n🧪 Testing Context Relevance - Case {i + 1}: {data['description']}" + ) + print(f" Question: {data['user_input']}") + print(f" Contexts: {len(data['retrieved_contexts'])} context(s)") + for j, ctx in enumerate(data["retrieved_contexts"]): + print(f" {j + 1}. 
{ctx[:60]}...") + + # Legacy implementation + legacy_context_relevance = LegacyContextRelevance(llm=nvidia_legacy_llm) + legacy_sample = SingleTurnSample( + user_input=data["user_input"], + retrieved_contexts=data["retrieved_contexts"], + ) + legacy_score = await legacy_context_relevance._single_turn_ascore( + legacy_sample, None + ) + + # V2 implementation + v2_context_relevance = ContextRelevance(llm=nvidia_modern_llm) + v2_result = await v2_context_relevance.ascore( + user_input=data["user_input"], + retrieved_contexts=data["retrieved_contexts"], + ) + + score_diff = ( + abs(legacy_score - v2_result.value) + if not np.isnan(legacy_score) and not np.isnan(v2_result.value) + else 0.0 + ) + print(f" Legacy: {legacy_score:.6f}") + print(f" V2: {v2_result.value:.6f}") + print(f" Diff: {score_diff:.6f}") + + # Both implementations use dual judges with same temperature=0.1 - should be identical + if not np.isnan(legacy_score) and not np.isnan(v2_result.value): + assert score_diff < 0.01, ( + f"Legacy and V2 scores should be nearly identical: Legacy={legacy_score:.6f}, " + f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.01)" + ) + print(" ✅ Both implementations give consistent scores") + else: + print(" ℹ️ One or both scores are NaN - edge case handling") + + # Validate score ranges (should be 0-1 or NaN) + if not np.isnan(legacy_score): + assert 0.0 <= legacy_score <= 1.0 + if not np.isnan(v2_result.value): + assert 0.0 <= v2_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_context_relevance_edge_cases(self, test_modern_llm): + """Test edge cases like empty contexts and queries.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for edge case testing") + + metric = ContextRelevance(llm=test_modern_llm) + + # Test empty user input + with pytest.raises(ValueError, match="user_input is missing"): + await metric.ascore( + user_input="", + retrieved_contexts=["Some context."], + ) + + # Test empty contexts + with 
pytest.raises(ValueError, match="retrieved_contexts is missing"): + await metric.ascore( + user_input="What is AI?", + retrieved_contexts=[], + ) + + @pytest.mark.asyncio + async def test_context_relevance_dual_judge_system(self, test_modern_llm): + """Test that v2 implementation correctly uses dual-judge system.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for dual-judge testing") + + metric = ContextRelevance(llm=test_modern_llm) + + # Test case where context is clearly relevant + result = await metric.ascore( + user_input="What is the capital of France?", + retrieved_contexts=["Paris is the capital of France and its largest city."], + ) + + print(f"Dual-judge relevance result: {result.value:.3f}") + + # Should be high score for relevant context + if not np.isnan(result.value): + assert 0.5 <= result.value <= 1.0, ( + f"Expected high score for relevant context, got {result.value}" + ) + + def test_context_relevance_migration_requirements_documented(self): + """Test that migration requirements are properly documented.""" + + # V2 implementation should not accept legacy components + with pytest.raises((TypeError, ValueError, AttributeError)): + ContextRelevance(llm="invalid_llm_type") # Should reject string + + # V2 should only accept InstructorBaseRagasLLM + with pytest.raises((TypeError, ValueError, AttributeError)): + ContextRelevance(llm=None) # Should reject None