
Commit 7420394

Authored by abhahn (Abby Hartman) and co-author
DocumentRetrievalEvaluator: Small fixes for importing, threshold setting and metrics output (Azure#40929)
* Small fixes for importing, threshold setting and metrics output
* Updated threshold test
* Update eval mapping

---------

Co-authored-by: Abby Hartman <[email protected]>
1 parent 3441543 commit 7420394

File tree: 4 files changed (+35, -41 lines)


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@
 from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
 from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
 from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
+from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
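
With the evaluator re-exported from the package root, it should be importable without reaching into the private `_evaluators` module. A minimal sketch, assuming the package is installed and that the class name matches the re-export above:

# Top-level import now resolves because __init__.py re-exports the class.
from azure.ai.evaluation import DocumentRetrievalEvaluator

# Construct with defaults; thresholds can be overridden per the constructor
# change further down in this commit.
evaluator = DocumentRetrievalEvaluator()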

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@
     CodeVulnerabilityEvaluator,
     CoherenceEvaluator,
     ContentSafetyEvaluator,
+    DocumentRetrievalEvaluator,
     F1ScoreEvaluator,
     FluencyEvaluator,
     GleuScoreEvaluator,
@@ -45,6 +46,7 @@
     CodeVulnerabilityEvaluator: "code_vulnerability",
     CoherenceEvaluator: "coherence",
     ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
     ECIEvaluator: "eci",
     F1ScoreEvaluator: "f1_score",
     FluencyEvaluator: "fluency",
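
The second hunk registers the evaluator under the metric name "document_retrieval". Illustrative sketch only; the actual dictionary name in `_eval_mapping.py` is not visible in this diff, so `_EVALUATOR_NAME_MAP` below is a hypothetical stand-in for the class-to-name mapping:

# Hypothetical stand-in for the mapping shown in the diff above.
_EVALUATOR_NAME_MAP = {
    DocumentRetrievalEvaluator: "document_retrieval",
    F1ScoreEvaluator: "f1_score",
}

def metric_name_for(evaluator_cls) -> str:
    # Resolve the short metric name an evaluator reports under.
    return _EVALUATOR_NAME_MAP[evaluator_cls]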

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

Lines changed: 20 additions & 28 deletions
@@ -4,7 +4,7 @@
 import math
 import operator
 from itertools import starmap
-from typing import Dict, List, TypedDict, Tuple, Optional
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._exceptions import EvaluationException
 from typing_extensions import override, overload
@@ -56,7 +56,13 @@ def __init__(
         *,
         ground_truth_label_min: int = 0,
         ground_truth_label_max: int = 4,
-        threshold: Optional[dict] = None,
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50
     ):
         super().__init__()
         self.k = 3
@@ -81,27 +87,19 @@ def __init__(
         self.ground_truth_label_max = ground_truth_label_max
 
         # The default threshold for metrics where higher numbers are better.
-        self._threshold_metrics = {
-            "ndcg@3": 0.5,
-            "xdcg@3": 0.5,
-            "fidelity": 0.5,
-            "top1_relevance": 50,
-            "top3_max_relevance": 50,
-            "total_retrieved_documents": 50,
-            "total_ground_truth_documents": 50,
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
         }
 
         # Ideally, the number of holes should be zero.
         self._threshold_holes = {"holes": 0, "holes_ratio": 0}
 
-        if threshold and not isinstance(threshold, dict):
-            raise EvaluationException(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-
-        elif isinstance(threshold, dict):
-            self._threshold_metrics.update(threshold)
-
     def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
         """
         The number of documents retrieved from a search query which have no provided ground-truth label.
@@ -224,22 +222,16 @@ def calculate_weighted_sum_by_rating(labels: List[int]) -> float:
            return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
 
     def _get_binary_result(self, **metrics) -> Dict[str, float]:
-        result = {}
+        result: Dict[str, Any] = {}
 
         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] = (
-                    metric_value >= self._threshold_metrics[metric_name]
-                )
-                result[f"{metric_name}_threshold"] = self._threshold_metrics[
-                    metric_name
-                ]
+                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True
 
             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] = (
-                    metric_value <= self._threshold_holes[metric_name]
-                )
+                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False
 
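
The net effect of this file's changes: the single `threshold` dict argument is replaced with one keyword argument per metric, and `_get_binary_result` now reports "pass"/"fail" strings rather than booleans. A construction sketch under those assumptions (values are illustrative; the evaluator's call signature for scoring records is not shown in this diff):

# Per-metric thresholds are now plain keyword arguments.
evaluator = DocumentRetrievalEvaluator(
    ground_truth_label_min=0,
    ground_truth_label_max=2,
    ndcg_threshold=0.7,
    xdcg_threshold=0.7,
    fidelity_threshold=0.7,
)
# For each metric, the result payload now carries string verdicts, e.g.
#   "ndcg@3_result": "pass" or "fail"
#   "ndcg@3_threshold": 0.7
#   "ndcg@3_higher_is_better": True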

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_document_retrieval_evaluator.py

Lines changed: 12 additions & 13 deletions
@@ -116,28 +116,27 @@ def test_incorrect_groundtruth_max():
         exc_info._excinfo[1]
     )
 
-def test_threshold(doc_retrieval_eval_data):
+def test_thresholds(doc_retrieval_eval_data):
     _, records = doc_retrieval_eval_data
     record = records[-1]
     custom_threshold_subset = {
-        "ndcg@3": 0.7,
-        "xdcg@3": 0.7,
-        "fidelity": 0.7,
+        "ndcg_threshold": 0.7,
+        "xdcg_threshold": 0.7,
+        "fidelity_threshold": 0.7,
     }
 
     custom_threshold_superset = {
-        "ndcg@3": 0.7,
-        "xdcg@3": 0.7,
-        "fidelity": 0.7,
-        "top1_relevance": 70,
-        "top3_max_relevance": 70,
-        "total_retrieved_documents": 10,
-        "total_ground_truth_documents": 10,
-        "unknown_metric": 50
+        "ndcg_threshold": 0.7,
+        "xdcg_threshold": 0.7,
+        "fidelity_threshold": 0.7,
+        "top1_relevance_threshold": 70,
+        "top3_max_relevance_threshold": 70,
+        "total_retrieved_documents_threshold": 10,
+        "total_ground_truth_documents_threshold": 10
    }
 
     for threshold in [custom_threshold_subset, custom_threshold_superset]:
-        evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=2, threshold=threshold)
+        evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=2, **threshold)
         results = evaluator(**record)
 
         expected_keys = [
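
Since thresholds are now real keyword parameters, the old `"unknown_metric": 50` entry from the superset case is gone: assuming the constructor does not also accept arbitrary `**kwargs`, an unrecognized threshold name now fails at construction time instead of being silently merged into the metric map, e.g.:

# Unrecognized threshold names should raise immediately (assumption: the
# constructor takes only the explicit keyword parameters shown above).
try:
    DocumentRetrievalEvaluator(unknown_metric_threshold=50)
except TypeError as exc:
    print(f"rejected: {exc}")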
