
Commit 89c53e7

Copilot and slister1001 authored
[evaluation] refactor _evaluate_query parameter to kwargs (#42168)
* Initial plan
* Refactor _evaluate_query to public evaluate_query parameter with backward compatibility
* Update changelog and add documentation for evaluate_query parameter refactoring
* Refactor evaluate_query parameter to use kwargs pattern instead of explicit parameter
* Remove backward compatibility for _evaluate_query parameter and update changelog
* Remove all remaining references to deprecated _evaluate_query parameter
* Run black formatter

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: slister1001 <[email protected]>
1 parent c14186a · commit 89c53e7

File tree

13 files changed: +51 −42 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@
 ## 1.10.0 (2025-07-29)

 ### Breaking Changes
-- Added `_evaluate_query` parameter to `RaiServiceEvaluatorBase` class with a default value of `False`. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. Existing code that relies on queries being evaluated will need to explicitly set `_evaluate_query=True` to maintain the previous behavior.
+
+- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.

 ### Bugs Fixed
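At the call site, the migration is a rename from the private underscore parameter to a public keyword argument. A minimal sketch of before and after, assuming a working `DefaultAzureCredential` and a valid project value (the `azure_ai_project` string below is a placeholder, not a real endpoint):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    credential = DefaultAzureCredential()
    azure_ai_project = "<your-azure-ai-project>"  # placeholder

    # Before this commit (no longer accepted):
    #   ViolenceEvaluator(credential, azure_ai_project, _evaluate_query=True)

    # After this commit, evaluate_query travels through **kwargs:
    evaluator = ViolenceEvaluator(credential, azure_ai_project, evaluate_query=True)

    # With evaluate_query=True, both fields are sent for evaluation;
    # with the default False, only the response would be.
    result = evaluator(query="What is the capital of France?", response="Paris.")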

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

Lines changed: 6 additions & 3 deletions
@@ -88,14 +88,17 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = True,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload
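The guard at the top of `__init__` preserves this evaluator's historical default of `True` while still letting callers override it through `**kwargs`. A minimal sketch of the same idiom in isolation (`make_kwargs` is a hypothetical helper, not SDK code); `dict.setdefault` is a one-line equivalent of the `if "evaluate_query" not in kwargs` guard:

    def make_kwargs(**kwargs):
        # Only sets the key when the caller did not supply it.
        kwargs.setdefault("evaluate_query", True)
        return kwargs

    assert make_kwargs() == {"evaluate_query": True}
    assert make_kwargs(evaluate_query=False) == {"evaluate_query": False}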

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 8 additions & 5 deletions
@@ -41,11 +41,12 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
-    :param _evaluate_query: If True, the query will be included in the evaluation data when evaluating
-        query-response pairs. If False, only the response will be evaluated. Default is False.
-    :type _evaluate_query: bool
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """

     @override
@@ -57,8 +58,8 @@ def __init__(
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
-        _evaluate_query: bool = False,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -70,7 +71,9 @@
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
-        self._evaluate_query = _evaluate_query
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override
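A minimal standalone sketch of the base-class behavior after this change (the class name is illustrative, not the SDK's; only the kwargs handling is reproduced):

    class _RaiEvaluatorSketch:
        def __init__(self, threshold: int = 3, **kwargs):
            self._threshold = threshold
            # evaluate_query arrives via **kwargs; when absent, fall back to False.
            self._evaluate_query = kwargs.get("evaluate_query", False)

    assert _RaiEvaluatorSketch()._evaluate_query is False
    assert _RaiEvaluatorSketch(evaluate_query=True)._evaluate_query is True

A side effect of the kwargs pattern is that `evaluate_query` no longer appears in the constructor signature for IDEs and type checkers, which is presumably why the docstring entry above was added.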

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 11 additions & 11 deletions
@@ -33,8 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
-    :param _evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
-    :type _evaluate_query: bool
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -81,7 +81,6 @@ def __init__(
         sexual_threshold: int = 3,
         self_harm_threshold: int = 3,
         hate_unfairness_threshold: int = 3,
-        _evaluate_query: bool = False,
         **kwargs,
     ):
         # Type checking
@@ -94,16 +93,17 @@
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(
-                credential, azure_ai_project, threshold=violence_threshold, _evaluate_query=_evaluate_query
-            ),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, _evaluate_query=_evaluate_query),
-            SelfHarmEvaluator(
-                credential, azure_ai_project, threshold=self_harm_threshold, _evaluate_query=_evaluate_query
-            ),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
             HateUnfairnessEvaluator(
-                credential, azure_ai_project, threshold=hate_unfairness_threshold, _evaluate_query=_evaluate_query
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
             ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
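Note the design choice here: rather than forwarding all of `**kwargs` to each child evaluator, the constructor copies out only `evaluate_query`, so unrelated keyword arguments stay with the composite and an absent key lets each child keep its own default. A minimal sketch of that extraction (`_forward_key` is a hypothetical helper, not SDK code):

    def _forward_key(kwargs, key="evaluate_query"):
        # Forward the key only if the caller actually supplied it.
        return {key: kwargs[key]} if key in kwargs else {}

    assert _forward_key({"evaluate_query": True}) == {"evaluate_query": True}
    assert _forward_key({}) == {}  # children fall back to their own defaults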

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 2 additions & 2 deletions
@@ -91,7 +91,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -100,7 +100,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -85,7 +85,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,
@@ -96,7 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
@@ -96,7 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
        )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py

Lines changed: 2 additions & 3 deletions
@@ -59,14 +59,13 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=_InternalEvaluationMetrics.ECI,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

Lines changed: 6 additions & 3 deletions
@@ -59,14 +59,17 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = True,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload
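One behavioral detail visible across these diffs but not spelled out in the changelog: `ProtectedMaterialEvaluator` and `CodeVulnerabilityEvaluator` seed `evaluate_query` to `True` before delegating, while the other evaluators fall through to the base-class default of `False`. A summary of the effective defaults implied by this commit (the dict is illustrative, not SDK code):

    # Effective defaults for evaluate_query, as implied by the diffs above:
    EFFECTIVE_DEFAULTS = {
        "CodeVulnerabilityEvaluator": True,   # seeded in its __init__ before super()
        "ProtectedMaterialEvaluator": True,   # seeded in its __init__ before super()
        "ViolenceEvaluator": False,           # base-class default via kwargs.get
        "SexualEvaluator": False,
        "SelfHarmEvaluator": False,
        "HateUnfairnessEvaluator": False,
        "ContentSafetyEvaluator": False,
        "EciEvaluator": False,
    }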
