
Commit 8d3bb37

nagkumar91 (Nagkumar Arkalgud) authored
Update binarization to be individual params (Azure#40168)
* Update task_query_response.prompty remove required keys
* Update task_simulate.prompty
* Update task_query_response.prompty
* Update task_simulate.prompty
* Fix the api_key needed
* Update for release
* Black fix for file
* Add original text in global context
* Update test
* Update the indirect attack simulator
* Black suggested fixes
* Update simulator prompty
* Update adversarial scenario enum to exclude XPIA
* Update changelog
* Black fixes
* Remove duplicate import
* Fix the mypy error
* Mypy please be happy
* Updates to non adv simulator
* Accept context from assistant messages, exclude them when using them for conversation
* Update changelog
* Pylint fixes
* Pylint fixes
* Remove redundant quotes
* Fix typo
* Pylint fix
* Update broken tests
* Include the grounding json in the manifest
* Fix typo
* Come on package
* Release 1.0.0b5
* Notice from Chang
* Remove adv_conv template parameters from the outputs
* Update changelog
* Experimental tags on adv scenarios
* Readme fix on breaking change
* Add the category and both user and assistant context to the response of qr_json_lines
* Update changelog
* Rename _kwargs to _options
* _options as prefix
* Update troubleshooting for simulator
* Rename according to suggestions
* Clean up readme
* More links
* Bugfix: zip_longest created null parameters
* Updated changelog
* Zip does the job
* Remove unused import
* Fix changelog merge
* Remove print statements
* Update the threshold to be individual parameters
* Remove higher is better as a public ref

---------

Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
1 parent fefe1a3 commit 8d3bb37


7 files changed: +143 -112 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ def __init__(
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+        self._higher_is_better = _higher_is_better
 
     @override
     def __call__(  # pylint: disable=docstring-missing-param

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 32 additions & 8 deletions
@@ -25,8 +25,14 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param threshold: The threshold for the content safety evaluator. Default is 3.
-    :type threshold: int
+    :param violence_threshold: The threshold for violence evaluation. Default is 3.
+    :type violence_threshold: int
+    :param sexual_threshold: The threshold for sexual content evaluation. Default is 3.
+    :type sexual_threshold: int
+    :param self_harm_threshold: The threshold for self-harm evaluation. Default is 3.
+    :type self_harm_threshold: int
+    :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
+    :type hate_unfairness_threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -40,7 +46,6 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         :dedent: 8
         :caption: Initialize and call a ContentSafetyEvaluator.
 
-    # todo: should threshold be a dict like QAEvaluator?
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -54,12 +59,31 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, credential, azure_ai_project, threshold=3, **kwargs):
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        violence_threshold: int = 3,
+        sexual_threshold: int = 3,
+        self_harm_threshold: int = 3,
+        hate_unfairness_threshold: int = 3,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("violence_threshold", violence_threshold),
+            ("sexual_threshold", sexual_threshold),
+            ("self_harm_threshold", self_harm_threshold),
+            ("hate_unfairness_threshold", hate_unfairness_threshold),
+        ]:
+            if not isinstance(value, int):
+                raise TypeError(f"{name} must be an int, got {type(value)}")
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
+            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
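For reference, a minimal usage sketch of the updated constructor (not part of the commit; the credential and the azure_ai_project keys shown are placeholders inferred from the docstring above):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    # Placeholder project scope; use your own subscription id, resource group, and project name.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # Each harm category now takes its own integer threshold instead of one shared value.
    content_safety_eval = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
        violence_threshold=3,
        sexual_threshold=3,
        self_harm_threshold=1,
        hate_unfairness_threshold=3,
    )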

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py

Lines changed: 42 additions & 32 deletions
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Optional, Union
+from typing import Union
 
 from typing_extensions import overload, override
 
@@ -23,13 +23,18 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param threshold: Optional dictionary of thresholds for different evaluation metrics.
-        Keys can be "groundedness", "relevance", "coherence", "fluency", "similarity",
-        and "f1_score". Default values are 3 for integer metrics and 0.5 for float
-        metrics. If None or an empty dictionary is provided, default values will be
-        used for all metrics. If a partial dictionary is provided, default values
-        will be used for any missing keys.
-    :type threshold: Optional[dict]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -62,31 +67,36 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, model_config, threshold: Optional[dict] = {}, **kwargs):
-        default_threshold = {
-            "groundedness": 3,
-            "relevance": 3,
-            "coherence": 3,
-            "fluency": 3,
-            "similarity": 3,
-            "f1_score": 0.5,
-        }
-        if threshold is None:
-            threshold = {}
-        for key in default_threshold.keys():
-            if key not in threshold:
-                threshold[key] = default_threshold[key]
-            if not isinstance(threshold[key], (int, float)):
-                raise TypeError(
-                    f"Threshold for {key} must be an int or float, got {type(threshold[key])}"
-                )
+    def __init__(
+        self,
+        model_config,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=threshold["groundedness"]),
-            RelevanceEvaluator(model_config, threshold=threshold["relevance"]),
-            CoherenceEvaluator(model_config, threshold=threshold["coherence"]),
-            FluencyEvaluator(model_config, threshold=threshold["fluency"]),
-            SimilarityEvaluator(model_config, threshold=threshold["similarity"]),
-            F1ScoreEvaluator(threshold=threshold["f1_score"]),
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
        ]
         super().__init__(evaluators=evaluators, **kwargs)
 
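A minimal sketch of the new up-front validation (not part of the commit; the model_config values are placeholders): a non-numeric threshold now raises TypeError before any sub-evaluator is constructed.

    from azure.ai.evaluation import QAEvaluator

    model_config = {
        "azure_endpoint": "<endpoint>",      # placeholder
        "azure_deployment": "<deployment>",  # placeholder
        "api_key": "<api-key>",              # placeholder
    }

    try:
        # groundedness_threshold must be an int or float; a string trips the type check.
        QAEvaluator(model_config=model_config, groundedness_threshold="3")
    except TypeError as err:
        print(err)  # groundedness_threshold must be an int or float, got <class 'str'>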

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_rouge/_rouge.py

Lines changed: 42 additions & 34 deletions
@@ -54,10 +54,12 @@ class RougeScoreEvaluator(EvaluatorBase):
     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
     :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
     :type rouge_type: str
-    :param threshold: The threshold value to determine if the evaluation passes or fails.
-        Can be either a float (applied to all metrics) or a dictionary with separate thresholds for each metric
-        {"precision": float, "recall": float, "f1_score": float}. Default is 0.5.
-    :type threshold: Union[float, dict]
+    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+    :type precision_threshold: float
+    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+    :type recall_threshold: float
+    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+    :type f1_score_threshold: float
 
     .. admonition:: Example:
 
@@ -82,24 +84,31 @@ class RougeScoreEvaluator(EvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, rouge_type: RougeType, threshold: dict = {}):
+    def __init__(
+        self,
+        rouge_type: RougeType,
+        precision_threshold: float = 0.5,
+        recall_threshold: float = 0.5,
+        f1_score_threshold: float = 0.5
+    ):
         self._rouge_type = rouge_type
         self._higher_is_better = True
         super().__init__()
-        default_threshold = {
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1_score": 0.5,
+
+        # Type checking for threshold parameters
+        for name, value in [
+            ("precision_threshold", precision_threshold),
+            ("recall_threshold", recall_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, float):
+                raise TypeError(f"{name} must be a float, got {type(value)}")
+
+        self._threshold = {
+            "precision": precision_threshold,
+            "recall": recall_threshold,
+            "f1_score": f1_score_threshold,
         }
-        if not isinstance(threshold, dict):
-            raise TypeError(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-        for key in default_threshold.keys():
-            if key not in threshold:
-                threshold[key] = default_threshold[key]
-
-        self._threshold = threshold
 
     def _get_binary_result(
         self,
@@ -130,23 +139,22 @@ def _get_binary_result(
         precision_valid = not math.isnan(rouge_precision)
         recall_valid = not math.isnan(rouge_recall)
         f1_valid = not math.isnan(rouge_f1_score)
-        if all(key in self._threshold for key in ["precision", "recall", "f1_score"]):
-            if self._higher_is_better:
-                if precision_valid:
-                    results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
-                if recall_valid:
-                    results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
-                if f1_valid:
-                    results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
-            else:
-                if precision_valid:
-                    results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
-                if recall_valid:
-                    results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
-                if f1_valid:
-                    results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
+        if self._higher_is_better:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
         else:
-            raise ValueError("Threshold dictionary must contain 'precision', 'recall', and 'f1_score' keys.")
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
         return results
 
     @override
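The binarization above reduces to a per-metric comparison against that metric's own threshold, skipping NaN scores. A standalone sketch of the rule (illustration only, not an SDK call; the score values are made up):

    import math

    def binarize(scores, thresholds, higher_is_better=True):
        # A score passes when it sits on the "good" side of its threshold; NaN scores are skipped.
        results = {}
        for name, score in scores.items():
            if math.isnan(score):
                continue
            passed = score >= thresholds[name] if higher_is_better else score <= thresholds[name]
            results[f"rouge_{name}_result"] = passed
        return results

    print(binarize(
        {"precision": 0.62, "recall": 0.40, "f1_score": 0.48},
        {"precision": 0.5, "recall": 0.5, "f1_score": 0.5},
    ))
    # {'rouge_precision_result': True, 'rouge_recall_result': False, 'rouge_f1_score_result': False}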

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

Lines changed: 1 addition & 3 deletions
@@ -65,18 +65,16 @@ def __init__(
         credential,
         azure_ai_project,
         threshold: int = 5,
-        _higher_is_better: bool = True,
         **kwargs,
     ):
         self.threshold = threshold
-        self._higher_is_better = _higher_is_better
+        self._higher_is_better = True
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
             threshold=self.threshold,
-            _higher_is_better=self._higher_is_better,
             **kwargs,
         )
 
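With this change the pass/fail direction is fixed to higher-is-better and the private _higher_is_better switch is gone from the constructor. A minimal usage sketch, assuming the public class exported from this module is GroundednessProEvaluator (not shown in the diff) and using placeholder credential/project values:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator  # assumed export name

    groundedness_pro_eval = GroundednessProEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project={
            "subscription_id": "<subscription-id>",     # placeholder
            "resource_group_name": "<resource-group>",  # placeholder
            "project_name": "<project-name>",           # placeholder
        },
        threshold=5,  # pass/fail cut-off; the evaluator now always treats higher scores as better
    )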

sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py

Lines changed: 12 additions & 13 deletions
@@ -247,14 +247,15 @@ def evaluation_classes_methods_with_thresholds(self):
             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
-        qa_eval = QAEvaluator(model_config=model_config, threshold={
-            "groundedness": 2,
-            "relevance": 2,
-            "coherence": 2,
-            "fluency": 2,
-            "similarity": 2,
-            "f1_score": 0.5,
-        })
+        qa_eval = QAEvaluator(
+            model_config=model_config,
+            groundedness_threshold=2,
+            relevance_threshold=2,
+            coherence_threshold=2,
+            fluency_threshold=2,
+            similarity_threshold=2,
+            f1_score_threshold=0.5
+        )
         qa_eval(query="This's the color?", response="Black", ground_truth="gray", context="gray")
         # [END threshold_qa_evaluator]
 
@@ -311,11 +312,9 @@ def evaluation_classes_methods_with_thresholds(self):
 
         rouge_evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_4,
-            threshold={
-                "precision": 0.5,
-                "recall": 0.5,
-                "f1_score": 0.5,
-            }
+            precision_threshold=0.5,
+            recall_threshold=0.5,
+            f1_score_threshold=0.5
         )
         rouge_evaluator(response="Paris is the capital of France.", ground_truth="France's capital is Paris.")
         # [END threshold_rouge_score_evaluator]

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_threshold_behavior.py

Lines changed: 13 additions & 22 deletions
@@ -124,7 +124,7 @@ def test_f1_score_threshold(self, mock_call, threshold, score, should_pass):
 
 @pytest.mark.unittest
 class TestRougeThresholdBehavior:
-    """Tests for threshold behavior in Rouge evaluators which use dictionary thresholds."""
+    """Tests for threshold behavior in Rouge evaluators which use individual threshold parameters."""
 
     def test_rouge_default_threshold(self):
         """Test that default thresholds are set correctly in Rouge evaluator."""
@@ -137,15 +137,11 @@ def test_rouge_default_threshold(self):
 
     def test_rouge_custom_threshold(self):
         """Test that custom thresholds work correctly in Rouge evaluator."""
-        custom_threshold = {
-            "precision": 0.9,
-            "recall": 0.1,
-            "f1_score": 0.75
-        }
-
         evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_L,
-            threshold=custom_threshold
+            precision_threshold=0.9,
+            recall_threshold=0.1,
+            f1_score_threshold=0.75
         )
 
         # Custom thresholds should be set
@@ -156,15 +152,11 @@ def test_rouge_custom_threshold(self):
     @patch("azure.ai.evaluation._evaluators._rouge._rouge.RougeScoreEvaluator.__call__")
     def test_rouge_threshold_behavior(self, mock_call):
         """Test threshold behavior with mocked Rouge scores."""
-        custom_threshold = {
-            "precision": 0.9,
-            "recall": 0.1,
-            "f1_score": 0.75
-        }
-
         evaluator = RougeScoreEvaluator(
             rouge_type=RougeType.ROUGE_L,
-            threshold=custom_threshold
+            precision_threshold=0.9,
+            recall_threshold=0.1,
+            f1_score_threshold=0.75
         )
 
         # Mock results with precision passing, recall failing, and f1_score passing
@@ -200,13 +192,12 @@ def test_rouge_threshold_behavior(self, mock_call):
     @patch("azure.ai.evaluation._evaluators._rouge._rouge.RougeScoreEvaluator.__call__")
     def test_rouge_different_types(self, mock_call, rouge_type):
         """Test that different Rouge types work correctly with thresholds."""
-        threshold = {
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1_score": 0.5
-        }
-
-        evaluator = RougeScoreEvaluator(rouge_type=rouge_type, threshold=threshold)
+        evaluator = RougeScoreEvaluator(
+            rouge_type=rouge_type,
+            precision_threshold=0.5,
+            recall_threshold=0.5,
+            f1_score_threshold=0.5
+        )
 
         # Mock scores that all pass the threshold
         result = {

0 commit comments
