
Commit 8d3bb37

nagkumar91 (Nagkumar Arkalgud) authored
Update binarization to be individual params (Azure#40168)
* Update task_query_response.prompty remove required keys
* Update task_simulate.prompty
* Update task_query_response.prompty
* Update task_simulate.prompty
* Fix the api_key needed
* Update for release
* Black fix for file
* Add original text in global context
* Update test
* Update the indirect attack simulator
* Black suggested fixes
* Update simulator prompty
* Update adversarial scenario enum to exclude XPIA
* Update changelog
* Black fixes
* Remove duplicate import
* Fix the mypy error
* Mypy please be happy
* Updates to non adv simulator
* Accept context from assistant messages, exclude them when using them for conversation
* Update changelog
* Pylint fixes
* Pylint fixes
* Remove redundant quotes
* Fix typo
* Pylint fix
* Update broken tests
* Include the grounding json in the manifest
* Fix typo
* Come on package
* Release 1.0.0b5
* Notice from Chang
* Remove adv_conv template parameters from the outputs
* Update changelog
* Experimental tags on adv scenarios
* Readme fix on breaking change
* Add the category and both user and assistant context to the response of qr_json_lines
* Update changelog
* Rename _kwargs to _options
* _options as prefix
* Update troubleshooting for simulator
* Rename according to suggestions
* Clean up readme
* More links
* Bugfix: zip_longest created null parameters
* Updated changelog
* Zip does the job
* Remove unused import
* Fix changelog merge
* Remove print statements
* Update the threshold to be individual parameters
* Remove higher is better as a public ref

---------

Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
1 parent fefe1a3 commit 8d3bb37


7 files changed: +143 -112 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ def __init__(
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+        self._higher_is_better = _higher_is_better
 
     @override
     def __call__(  # pylint: disable=docstring-missing-param

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 32 additions & 8 deletions
@@ -25,8 +25,14 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param threshold: The threshold for the content safety evaluator. Default is 3.
-    :type threshold: int
+    :param violence_threshold: The threshold for violence evaluation. Default is 3.
+    :type violence_threshold: int
+    :param sexual_threshold: The threshold for sexual content evaluation. Default is 3.
+    :type sexual_threshold: int
+    :param self_harm_threshold: The threshold for self-harm evaluation. Default is 3.
+    :type self_harm_threshold: int
+    :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
+    :type hate_unfairness_threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -40,7 +46,6 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         :dedent: 8
         :caption: Initialize and call a ContentSafetyEvaluator.
 
-    # todo: should threshold be a dict like QAEvaluator?
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -54,12 +59,31 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, credential, azure_ai_project, threshold=3, **kwargs):
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        violence_threshold: int = 3,
+        sexual_threshold: int = 3,
+        self_harm_threshold: int = 3,
+        hate_unfairness_threshold: int = 3,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("violence_threshold", violence_threshold),
+            ("sexual_threshold", sexual_threshold),
+            ("self_harm_threshold", self_harm_threshold),
+            ("hate_unfairness_threshold", hate_unfairness_threshold),
+        ]:
+            if not isinstance(value, int):
+                raise TypeError(f"{name} must be an int, got {type(value)}")
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
+            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
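For reference, a minimal usage sketch of the updated constructor (not part of the commit; the credential and the azure_ai_project keys shown are placeholders inferred from the docstring above):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    # Placeholder project scope; use your own subscription id, resource group, and project name.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # Each harm category now takes its own integer threshold instead of one shared value.
    content_safety_eval = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
        violence_threshold=3,
        sexual_threshold=3,
        self_harm_threshold=1,
        hate_unfairness_threshold=3,
    )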

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py

Lines changed: 42 additions & 32 deletions
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Optional, Union
+from typing import Union
 
 from typing_extensions import overload, override
 
@@ -23,13 +23,18 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param threshold: Optional dictionary of thresholds for different evaluation metrics.
-        Keys can be "groundedness", "relevance", "coherence", "fluency", "similarity",
-        and "f1_score". Default values are 3 for integer metrics and 0.5 for float
-        metrics. If None or an empty dictionary is provided, default values will be
-        used for all metrics. If a partial dictionary is provided, default values
-        will be used for any missing keys.
-    :type threshold: Optional[dict]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -62,31 +67,36 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, model_config, threshold: Optional[dict] = {}, **kwargs):
-        default_threshold = {
-            "groundedness": 3,
-            "relevance": 3,
-            "coherence": 3,
-            "fluency": 3,
-            "similarity": 3,
-            "f1_score": 0.5,
-        }
-        if threshold is None:
-            threshold = {}
-        for key in default_threshold.keys():
-            if key not in threshold:
-                threshold[key] = default_threshold[key]
-            if not isinstance(threshold[key], (int, float)):
-                raise TypeError(
-                    f"Threshold for {key} must be an int or float, got {type(threshold[key])}"
-                )
+    def __init__(
+        self,
+        model_config,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=threshold["groundedness"]),
-            RelevanceEvaluator(model_config, threshold=threshold["relevance"]),
-            CoherenceEvaluator(model_config, threshold=threshold["coherence"]),
-            FluencyEvaluator(model_config, threshold=threshold["fluency"]),
-            SimilarityEvaluator(model_config, threshold=threshold["similarity"]),
-            F1ScoreEvaluator(threshold=threshold["f1_score"]),
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
        ]
         super().__init__(evaluators=evaluators, **kwargs)
 
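A minimal sketch of the new up-front validation (not part of the commit; the model_config values are placeholders): a non-numeric threshold now raises TypeError before any sub-evaluator is constructed.

    from azure.ai.evaluation import QAEvaluator

    model_config = {
        "azure_endpoint": "<endpoint>",      # placeholder
        "azure_deployment": "<deployment>",  # placeholder
        "api_key": "<api-key>",              # placeholder
    }

    try:
        # groundedness_threshold must be an int or float; a string trips the type check.
        QAEvaluator(model_config=model_config, groundedness_threshold="3")
    except TypeError as err:
        print(err)  # groundedness_threshold must be an int or float, got <class 'str'>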

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py

Lines changed: 42 additions & 34 deletions
@@ -54,10 +54,12 @@ class RougeScoreEvaluator(EvaluatorBase):
     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
     :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
     :type rouge_type: str
-    :param threshold: The threshold value to determine if the evaluation passes or fails.
-        Can be either a float (applied to all metrics) or a dictionary with separate thresholds for each metric
-        {"precision": float, "recall": float, "f1_score": float}. Default is 0.5.
-    :type threshold: Union[float, dict]
+    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+    :type precision_threshold: float
+    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+    :type recall_threshold: float
+    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+    :type f1_score_threshold: float
 
     .. admonition:: Example:
 
@@ -82,24 +84,31 @@ class RougeScoreEvaluator(EvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, rouge_type: RougeType, threshold: dict = {}):
+    def __init__(
+        self,
+        rouge_type: RougeType,
+        precision_threshold: float = 0.5,
+        recall_threshold: float = 0.5,
+        f1_score_threshold: float = 0.5
+    ):
         self._rouge_type = rouge_type
         self._higher_is_better = True
         super().__init__()
-        default_threshold = {
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1_score": 0.5,
+
+        # Type checking for threshold parameters
+        for name, value in [
+            ("precision_threshold", precision_threshold),
+            ("recall_threshold", recall_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, float):
+                raise TypeError(f"{name} must be a float, got {type(value)}")
+
+        self._threshold = {
+            "precision": precision_threshold,
+            "recall": recall_threshold,
+            "f1_score": f1_score_threshold,
         }
-        if not isinstance(threshold, dict):
-            raise TypeError(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-        for key in default_threshold.keys():
-            if key not in threshold:
-                threshold[key] = default_threshold[key]
-
-        self._threshold = threshold
 
     def _get_binary_result(
         self,
@@ -130,23 +139,22 @@ def _get_binary_result(
         precision_valid = not math.isnan(rouge_precision)
         recall_valid = not math.isnan(rouge_recall)
         f1_valid = not math.isnan(rouge_f1_score)
-        if all(key in self._threshold for key in ["precision", "recall", "f1_score"]):
-            if self._higher_is_better:
-                if precision_valid:
-                    results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
-                if recall_valid:
-                    results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
-                if f1_valid:
-                    results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
-            else:
-                if precision_valid:
-                    results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
-                if recall_valid:
-                    results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
-                if f1_valid:
-                    results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
+        if self._higher_is_better:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
         else:
-            raise ValueError("Threshold dictionary must contain 'precision', 'recall', and 'f1_score' keys.")
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
         return results
 
     @override
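The binarization above reduces to a per-metric comparison against that metric's own threshold, skipping NaN scores. A standalone sketch of the rule (illustration only, not an SDK call; the score values are made up):

    import math

    def binarize(scores, thresholds, higher_is_better=True):
        # A score passes when it sits on the "good" side of its threshold; NaN scores are skipped.
        results = {}
        for name, score in scores.items():
            if math.isnan(score):
                continue
            passed = score >= thresholds[name] if higher_is_better else score <= thresholds[name]
            results[f"rouge_{name}_result"] = passed
        return results

    print(binarize(
        {"precision": 0.62, "recall": 0.40, "f1_score": 0.48},
        {"precision": 0.5, "recall": 0.5, "f1_score": 0.5},
    ))
    # {'rouge_precision_result': True, 'rouge_recall_result': False, 'rouge_f1_score_result': False}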

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

Lines changed: 1 addition & 3 deletions
@@ -65,18 +65,16 @@ def __init__(
         credential,
         azure_ai_project,
         threshold: int = 5,
-        _higher_is_better: bool = True,
         **kwargs,
     ):
         self.threshold = threshold
-        self._higher_is_better = _higher_is_better
+        self._higher_is_better = True
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
             threshold=self.threshold,
-            _higher_is_better=self._higher_is_better,
             **kwargs,
         )
 
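With this change the pass/fail direction is fixed to higher-is-better and the private _higher_is_better switch is gone from the constructor. A minimal usage sketch, assuming the public class exported from this module is GroundednessProEvaluator (not shown in the diff) and using placeholder credential/project values:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator  # assumed export name

    groundedness_pro_eval = GroundednessProEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project={
            "subscription_id": "<subscription-id>",     # placeholder
            "resource_group_name": "<resource-group>",  # placeholder
            "project_name": "<project-name>",           # placeholder
        },
        threshold=5,  # pass/fail cut-off; the evaluator now always treats higher scores as better
    )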

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py

Lines changed: 12 additions & 13 deletions
@@ -247,14 +247,15 @@ def evaluation_classes_methods_with_thresholds(self):
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
-        qa_eval = QAEvaluator(model_config=model_config, threshold={
-            "groundedness": 2,
-            "relevance": 2,
-            "coherence": 2,
-            "fluency": 2,
-            "similarity": 2,
-            "f1_score": 0.5,
-        })
+        qa_eval = QAEvaluator(
+            model_config=model_config,
+            groundedness_threshold=2,
+            relevance_threshold=2,
+            coherence_threshold=2,
+            fluency_threshold=2,
+            similarity_threshold=2,
+            f1_score_threshold=0.5
+        )
         qa_eval(query="This's the color?", response="Black", ground_truth="gray", context="gray")
         # [END threshold_qa_evaluator]
 
@@ -311,11 +312,9 @@ def evaluation_classes_methods_with_thresholds(self):
 
         rouge_evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_4,
-            threshold={
-                "precision": 0.5,
-                "recall": 0.5,
-                "f1_score": 0.5,
-            }
+            precision_threshold=0.5,
+            recall_threshold=0.5,
+            f1_score_threshold=0.5
         )
         rouge_evaluator(response="Paris is the capital of France.", ground_truth="France's capital is Paris.")
         # [END threshold_rouge_score_evaluator]

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py

Lines changed: 13 additions & 22 deletions
@@ -124,7 +124,7 @@ def test_f1_score_threshold(self, mock_call, threshold, score, should_pass):
 
 @pytest.mark.unittest
 class TestRougeThresholdBehavior:
-    """Tests for threshold behavior in Rouge evaluators which use dictionary thresholds."""
+    """Tests for threshold behavior in Rouge evaluators which use individual threshold parameters."""
 
     def test_rouge_default_threshold(self):
         """Test that default thresholds are set correctly in Rouge evaluator."""
@@ -137,15 +137,11 @@ def test_rouge_default_threshold(self):
 
     def test_rouge_custom_threshold(self):
         """Test that custom thresholds work correctly in Rouge evaluator."""
-        custom_threshold = {
-            "precision": 0.9,
-            "recall": 0.1,
-            "f1_score": 0.75
-        }
-
         evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_L,
-            threshold=custom_threshold
+            precision_threshold=0.9,
+            recall_threshold=0.1,
+            f1_score_threshold=0.75
         )
 
         # Custom thresholds should be set
@@ -156,15 +152,11 @@ def test_rouge_custom_threshold(self):
     @patch("azure.ai.evaluation._evaluators._rouge._rouge.RougeScoreEvaluator.__call__")
     def test_rouge_threshold_behavior(self, mock_call):
         """Test threshold behavior with mocked Rouge scores."""
-        custom_threshold = {
-            "precision": 0.9,
-            "recall": 0.1,
-            "f1_score": 0.75
-        }
-
         evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_L,
-            threshold=custom_threshold
+            precision_threshold=0.9,
+            recall_threshold=0.1,
+            f1_score_threshold=0.75
         )
 
         # Mock results with precision passing, recall failing, and f1_score passing
@@ -200,13 +192,12 @@ def test_rouge_threshold_behavior(self, mock_call):
     @patch("azure.ai.evaluation._evaluators._rouge._rouge.RougeScoreEvaluator.__call__")
     def test_rouge_different_types(self, mock_call, rouge_type):
         """Test that different Rouge types work correctly with thresholds."""
-        threshold = {
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1_score": 0.5
-        }
-
-        evaluator = RougeScoreEvaluator(rouge_type=rouge_type, threshold=threshold)
+        evaluator = RougeScoreEvaluator(
+            rouge_type=rouge_type,
+            precision_threshold=0.5,
+            recall_threshold=0.5,
+            f1_score_threshold=0.5
+        )
 
         # Mock scores that all pass the threshold
         result = {

0 commit comments
