
Commit 89c53e7

Copilot and slister1001 authored
[evaluation] refactor _evaluate_query parameter to kwargs (#42168)
* Initial plan
* Refactor _evaluate_query to public evaluate_query parameter with backward compatibility
* Update changelog and add documentation for evaluate_query parameter refactoring
* Refactor evaluate_query parameter to use kwargs pattern instead of explicit parameter
* Remove backward compatibility for _evaluate_query parameter and update changelog
* Remove all remaining references to deprecated _evaluate_query parameter
* Run black formatter

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: slister1001 <[email protected]>
1 parent c14186a · commit 89c53e7

File tree

13 files changed: +51 −42 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@
 ## 1.10.0 (2025-07-29)

 ### Breaking Changes
-- Added `_evaluate_query` parameter to `RaiServiceEvaluatorBase` class with a default value of `False`. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. Existing code that relies on queries being evaluated will need to explicitly set `_evaluate_query=True` to maintain the previous behavior.
+
+- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.

 ### Bugs Fixed
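At the call site, the migration is a rename from the private underscore parameter to a public keyword argument. A minimal sketch of before and after, assuming a working `DefaultAzureCredential` and a valid project value (the `azure_ai_project` string below is a placeholder, not a real endpoint):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    credential = DefaultAzureCredential()
    azure_ai_project = "<your-azure-ai-project>"  # placeholder

    # Before this commit (no longer accepted):
    #   ViolenceEvaluator(credential, azure_ai_project, _evaluate_query=True)

    # After this commit, evaluate_query travels through **kwargs:
    evaluator = ViolenceEvaluator(credential, azure_ai_project, evaluate_query=True)

    # With evaluate_query=True, both fields are sent for evaluation;
    # with the default False, only the response would be.
    result = evaluator(query="What is the capital of France?", response="Paris.")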

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

Lines changed: 6 additions & 3 deletions
@@ -88,14 +88,17 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = True,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload
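The guard at the top of `__init__` preserves this evaluator's historical default of `True` while still letting callers override it through `**kwargs`. A minimal sketch of the same idiom in isolation (`make_kwargs` is a hypothetical helper, not SDK code); `dict.setdefault` is a one-line equivalent of the `if "evaluate_query" not in kwargs` guard:

    def make_kwargs(**kwargs):
        # Only sets the key when the caller did not supply it.
        kwargs.setdefault("evaluate_query", True)
        return kwargs

    assert make_kwargs() == {"evaluate_query": True}
    assert make_kwargs(evaluate_query=False) == {"evaluate_query": False}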

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 8 additions & 5 deletions
@@ -41,11 +41,12 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
-    :param _evaluate_query: If True, the query will be included in the evaluation data when evaluating
-        query-response pairs. If False, only the response will be evaluated. Default is False.
-    :type _evaluate_query: bool
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """

     @override
@@ -57,8 +58,8 @@ def __init__(
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
-        _evaluate_query: bool = False,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -70,7 +71,9 @@
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
-        self._evaluate_query = _evaluate_query
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override
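A minimal standalone sketch of the base-class behavior after this change (the class name is illustrative, not the SDK's; only the kwargs handling is reproduced):

    class _RaiEvaluatorSketch:
        def __init__(self, threshold: int = 3, **kwargs):
            self._threshold = threshold
            # evaluate_query arrives via **kwargs; when absent, fall back to False.
            self._evaluate_query = kwargs.get("evaluate_query", False)

    assert _RaiEvaluatorSketch()._evaluate_query is False
    assert _RaiEvaluatorSketch(evaluate_query=True)._evaluate_query is True

A side effect of the kwargs pattern is that `evaluate_query` no longer appears in the constructor signature for IDEs and type checkers, which is presumably why the docstring entry above was added.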

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 11 additions & 11 deletions
@@ -33,8 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
-    :param _evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
-    :type _evaluate_query: bool
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -81,7 +81,6 @@ def __init__(
         sexual_threshold: int = 3,
         self_harm_threshold: int = 3,
         hate_unfairness_threshold: int = 3,
-        _evaluate_query: bool = False,
         **kwargs,
     ):
         # Type checking
@@ -94,16 +93,17 @@
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(
-                credential, azure_ai_project, threshold=violence_threshold, _evaluate_query=_evaluate_query
-            ),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, _evaluate_query=_evaluate_query),
-            SelfHarmEvaluator(
-                credential, azure_ai_project, threshold=self_harm_threshold, _evaluate_query=_evaluate_query
-            ),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
             HateUnfairnessEvaluator(
-                credential, azure_ai_project, threshold=hate_unfairness_threshold, _evaluate_query=_evaluate_query
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
             ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
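Note the design choice here: rather than forwarding all of `**kwargs` to each child evaluator, the constructor copies out only `evaluate_query`, so unrelated keyword arguments stay with the composite and an absent key lets each child keep its own default. A minimal sketch of that extraction (`_forward_key` is a hypothetical helper, not SDK code):

    def _forward_key(kwargs, key="evaluate_query"):
        # Forward the key only if the caller actually supplied it.
        return {key: kwargs[key]} if key in kwargs else {}

    assert _forward_key({"evaluate_query": True}) == {"evaluate_query": True}
    assert _forward_key({}) == {}  # children fall back to their own defaults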

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 2 additions & 2 deletions
@@ -91,7 +91,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -100,7 +100,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -85,7 +85,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,
@@ -96,7 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def __init__(
         azure_ai_project,
         *,
         threshold: int = 3,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
@@ -96,7 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
        )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py

Lines changed: 2 additions & 3 deletions
@@ -59,14 +59,13 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = False,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=_InternalEvaluationMetrics.ECI,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

Lines changed: 6 additions & 3 deletions
@@ -59,14 +59,17 @@ def __init__(
         self,
         credential,
         azure_ai_project,
-        *,
-        _evaluate_query: bool = True,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            _evaluate_query=_evaluate_query,
+            **kwargs,
         )

     @overload
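One behavioral detail visible across these diffs but not spelled out in the changelog: `ProtectedMaterialEvaluator` and `CodeVulnerabilityEvaluator` seed `evaluate_query` to `True` before delegating, while the other evaluators fall through to the base-class default of `False`. A summary of the effective defaults implied by this commit (the dict is illustrative, not SDK code):

    # Effective defaults for evaluate_query, as implied by the diffs above:
    EFFECTIVE_DEFAULTS = {
        "CodeVulnerabilityEvaluator": True,   # seeded in its __init__ before super()
        "ProtectedMaterialEvaluator": True,   # seeded in its __init__ before super()
        "ViolenceEvaluator": False,           # base-class default via kwargs.get
        "SexualEvaluator": False,
        "SelfHarmEvaluator": False,
        "HateUnfairnessEvaluator": False,
        "ContentSafetyEvaluator": False,
        "EciEvaluator": False,
    }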
