Merged
Changes from 7 commits
3 changes: 3 additions & 0 deletions src/ragas/metrics/collections/__init__.py
@@ -1,5 +1,6 @@
"""Collections of metrics using modern component architecture."""

from ragas.metrics.collections._answer_accuracy import AnswerAccuracy
from ragas.metrics.collections._answer_correctness import AnswerCorrectness
from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
from ragas.metrics.collections._answer_similarity import AnswerSimilarity
@@ -29,12 +30,14 @@

__all__ = [
"BaseMetric", # Base class
"AnswerAccuracy",
"AnswerCorrectness",
"AnswerRelevancy",
"AnswerSimilarity",
"AspectCritic",
"BleuScore",
"ContextEntityRecall",
"ContextRelevance",
"DistanceMeasure",
"ExactMatch",
"Faithfulness",
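For orientation, a minimal sketch of how the two new exports could be consumed (illustrative only; construction details follow the usage docstrings in the files below):

# Illustrative import of the newly exported metrics; both are built with an
# instructor-based LLM as shown in the per-class usage docstrings.
from ragas.metrics.collections import AnswerAccuracy, ContextRelevance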
171 changes: 171 additions & 0 deletions src/ragas/metrics/collections/_answer_accuracy.py
@@ -0,0 +1,171 @@
"""Answer Accuracy metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_accuracy import (
answer_accuracy_judge1_prompt,
answer_accuracy_judge2_prompt,
)

if t.TYPE_CHECKING:
from ragas.llms.base import InstructorBaseRagasLLM


class JudgeRating(BaseModel):
"""Structured output for judge rating."""

rating: int


class AnswerAccuracy(BaseMetric):
"""
Modern v2 implementation of answer accuracy evaluation.

Measures answer accuracy compared to ground truth using a dual-judge system.
The ratings from two distinct judge prompts are averaged for a more robust evaluation.

The metric uses NVIDIA's proven dual-judge approach:
1. Judge 1: Direct User Answer vs Reference Answer comparison
2. Judge 2: Swapped perspective for fairness
3. Average both judges for final score

Rating scale: 0 (no match), 2 (partial match), 4 (exact match)
Final score: Average of both judges converted to 0.0-1.0 scale

Usage:
>>> import instructor
>>> from openai import AsyncOpenAI
>>> from ragas.llms.base import instructor_llm_factory
>>> from ragas.metrics.collections import AnswerAccuracy
>>>
>>> # Setup dependencies
>>> client = AsyncOpenAI()
>>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o")
>>>
>>> # Create metric instance
>>> metric = AnswerAccuracy(llm=llm)
>>>
>>> # Single evaluation
>>> result = await metric.ascore(
... user_input="When was Einstein born?",
... response="Albert Einstein was born in 1879.",
... reference="Albert Einstein was born in 1879."
... )
>>> print(f"Answer Accuracy: {result.value}")

Attributes:
llm: Modern instructor-based LLM for dual-judge evaluation
name: The metric name
allowed_values: Score range (0.0 to 1.0, higher is better)
max_retries: Maximum retry attempts for invalid ratings
"""

# Type hints for linter (attributes are set in __init__)
llm: "InstructorBaseRagasLLM"

def __init__(
self,
llm: "InstructorBaseRagasLLM",
name: str = "answer_accuracy",
max_retries: int = 5,
**kwargs,
):
"""
Initialize AnswerAccuracy metric with required components.

Args:
llm: Modern instructor-based LLM for dual-judge evaluation
name: The metric name
max_retries: Maximum retry attempts for invalid ratings
"""
# Set attributes explicitly before calling super()
self.llm = llm
self.max_retries = max_retries

# Call super() for validation (without passing llm in kwargs)
super().__init__(name=name, **kwargs)

async def ascore(
self, user_input: str, response: str, reference: str
) -> MetricResult:
"""
Calculate answer accuracy score using dual-judge evaluation.

Args:
user_input: The original question
response: The user's answer to evaluate
reference: The ground truth reference answer

Returns:
MetricResult with answer accuracy score (0.0-1.0, higher is better)
"""
# Input validation
if not user_input:
raise ValueError(
"user_input is missing. Please add user_input to the test sample."
)
if not response:
raise ValueError(
"response is missing. Please add response to the test sample."
)
if not reference:
raise ValueError(
"reference is missing. Please add reference to the test sample."
)

# Get ratings from both judges with NVIDIA temperature (0.1)
judge1_rating = await self._get_judge_rating(
answer_accuracy_judge1_prompt(user_input, response, reference)
)
judge2_rating = await self._get_judge_rating(
answer_accuracy_judge2_prompt(
user_input, reference, response
) # Note: swapped order
)

# Average the scores (convert from 0,2,4 scale to 0.0-1.0)
score = self._average_scores(judge1_rating / 4.0, judge2_rating / 4.0)

return MetricResult(value=float(score))

async def _get_judge_rating(self, prompt: str) -> float:
"""Get rating from judge using structured JSON output."""
for retry in range(self.max_retries):
try:
# Use structured output with JSON - clean and reliable
result = await self.llm.agenerate(prompt, JudgeRating)
rating = result.rating

# Validate rating is in expected range
if rating in [0, 2, 4]:
return float(rating)
else:
# Invalid rating - retry or return NaN
if retry < self.max_retries - 1:
continue # Retry if invalid rating
else:
return float("nan")

except Exception:
if retry < self.max_retries - 1:
continue # Retry on exception
else:
return float("nan")

return float("nan")

def _average_scores(self, score1: float, score2: float) -> float:
"""Average two judge scores, handling NaN values."""
if not np.isnan(score1) and not np.isnan(score2):
return (score1 + score2) / 2.0
elif not np.isnan(score1):
return score1
elif not np.isnan(score2):
return score2
else:
return float("nan")
177 changes: 177 additions & 0 deletions src/ragas/metrics/collections/_context_relevance.py
@@ -0,0 +1,177 @@
"""Context Relevance metric v2 - Modern implementation with dual-judge evaluation."""

import typing as t
from typing import List

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.context_relevance import (
context_relevance_judge1_prompt,
context_relevance_judge2_prompt,
)

if t.TYPE_CHECKING:
from ragas.llms.base import InstructorBaseRagasLLM


class RelevanceRating(BaseModel):
"""Structured output for relevance rating."""

rating: int


class ContextRelevance(BaseMetric):
"""
Modern v2 implementation of context relevance evaluation.

Evaluates whether the retrieved contexts are pertinent to the user input
using a dual-judge system. The ratings from two distinct judge prompts
are averaged for a more robust evaluation.

The metric uses NVIDIA's proven dual-judge approach:
1. Judge 1: Direct context relevance evaluation
2. Judge 2: Alternative perspective for fairness
3. Average both judges for final score

Rating scale: 0 (not relevant), 1 (partially relevant), 2 (fully relevant)
Final score: Average of both judges converted to 0.0-1.0 scale

Usage:
>>> import instructor
>>> from openai import AsyncOpenAI
>>> from ragas.llms.base import instructor_llm_factory
>>> from ragas.metrics.collections import ContextRelevance
>>>
>>> # Setup dependencies
>>> client = AsyncOpenAI()
>>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o")
>>>
>>> # Create metric instance
>>> metric = ContextRelevance(llm=llm)
>>>
>>> # Single evaluation
>>> result = await metric.ascore(
... user_input="When was Einstein born?",
... retrieved_contexts=["Albert Einstein was born March 14, 1879."]
... )
>>> print(f"Context Relevance: {result.value}")

Attributes:
llm: Modern instructor-based LLM for dual-judge evaluation
name: The metric name
allowed_values: Score range (0.0 to 1.0, higher is better)
max_retries: Maximum retry attempts for invalid ratings
"""

# Type hints for linter (attributes are set in __init__)
llm: "InstructorBaseRagasLLM"

def __init__(
self,
llm: "InstructorBaseRagasLLM",
name: str = "context_relevance",
max_retries: int = 5,
**kwargs,
):
"""
Initialize ContextRelevance metric with required components.

Args:
llm: Modern instructor-based LLM for dual-judge evaluation
name: The metric name
max_retries: Maximum retry attempts for invalid ratings
"""
# Set attributes explicitly before calling super()
self.llm = llm
self.max_retries = max_retries

# Call super() for validation (without passing llm in kwargs)
super().__init__(name=name, **kwargs)

async def ascore(
self, user_input: str, retrieved_contexts: List[str]
) -> MetricResult:
"""
Calculate context relevance score using dual-judge evaluation.

Args:
user_input: The original question
retrieved_contexts: The retrieved contexts to evaluate for relevance

Returns:
MetricResult with context relevance score (0.0-1.0, higher is better)
"""
# Input validation
if not user_input:
raise ValueError(
"user_input is missing. Please add user_input to the test sample."
)
if not retrieved_contexts:
raise ValueError(
"retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
)

# Handle edge cases the same way as the legacy implementation
context_str = "\n".join(retrieved_contexts)

if not user_input.strip() or not context_str.strip():
return MetricResult(value=0.0)

# Edge case: if user input matches context exactly
if user_input.strip() == context_str.strip():
return MetricResult(value=0.0)

# Edge case: if context is contained in user input
if context_str.strip() in user_input.strip():
return MetricResult(value=0.0)

# Get ratings from both judges with NVIDIA temperature (0.1)
judge1_rating = await self._get_judge_rating(
context_relevance_judge1_prompt(user_input, context_str)
)
judge2_rating = await self._get_judge_rating(
context_relevance_judge2_prompt(user_input, context_str)
)

# Average the scores (convert from 0,1,2 scale to 0.0-1.0)
score = self._average_scores(judge1_rating / 2.0, judge2_rating / 2.0)

return MetricResult(value=float(score))

async def _get_judge_rating(self, prompt: str) -> float:
"""Get rating from judge with retry logic and NVIDIA temperature."""
for retry in range(self.max_retries):
try:
result = await self.llm.agenerate(prompt, RelevanceRating)
rating = result.rating

# Validate rating is in expected range
if rating in [0, 1, 2]:
return float(rating)
else:
if retry < self.max_retries - 1:
continue # Retry if invalid rating
else:
return float("nan")

except Exception:
if retry < self.max_retries - 1:
continue # Retry on exception
else:
return float("nan")

return float("nan")

def _average_scores(self, score1: float, score2: float) -> float:
"""Average two judge scores, handling NaN values."""
if not np.isnan(score1) and not np.isnan(score2):
return (score1 + score2) / 2.0
elif not np.isnan(score1):
return score1
elif not np.isnan(score2):
return score2
else:
return float("nan")