azure-sdk
diff --git a/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 8 additions & 1 deletion b/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py‎
Lines changed: 5 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py‎
Lines changed: 23 additions & 3 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py‎
Lines changed: 23 additions & 3 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py‎
Lines changed: 21 additions & 2 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py‎
Lines changed: 32 additions & 2 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py‎
Lines changed: 32 additions & 2 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py‎
Lines changed: 3 additions & 1 deletion b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py‎
Lines changed: 43 additions & 4 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py‎
Lines changed: 43 additions & 4 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py‎
Lines changed: 9 additions & 2 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py‎
Lines changed: 17 additions & 5 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py‎
Lines changed: 17 additions & 5 deletions
@@ -3,8 +3,15 @@
 ## 1.4.0 (Unreleased)
 
 ### Features Added
+- Enhanced binary evaluation results with customizable thresholds
+  - Added threshold support for QA and ContentSafety evaluators
+  - Evaluation results now include both the score and threshold values
+  - Configurable threshold parameter allows custom binary classification boundaries
+  - Default thresholds provided for backward compatibility
+  - Quality evaluators use "higher is better" scoring (score ≥ threshold is positive)
+  - Content safety evaluators use "lower is better" scoring (score ≤ threshold is positive)
 - New Built-in evaluator called CodeVulnerabilityEvaluator is added. 
-  - It provides a capabilities to identify the following code vulnerabilities.
+  - It provides capabilities to identify the following code vulnerabilities.
     - path-injection
     - sql-injection
     - code-injection
 
@@ -94,3 +94,8 @@ class _AggregationType(enum.Enum):
 AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
 
 OPENAI_TYPE: Literal["openai"] = "openai"
+
+EVALUATION_PASS_FAIL_MAPPING = {
+    True: "pass",
+    False: "fail",
+}
@@ -8,6 +8,7 @@
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
 class BleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,8 @@ class BleuScoreEvaluator(EvaluatorBase):
     indicator of quality.
 
     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    :param threshold: The threshold for the evaluation. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -31,17 +34,27 @@ class BleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize and call an BleuScoreEvaluator.
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_bleu_score_evaluator]
+            :end-before: [END threshold_bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
-        """Produce a glue score evaluation result.
+        """Produce a bleu score evaluation result.
 
         :param eval_input: The input to the evaluation function.
         :type eval_input: Dict
@@ -56,9 +69,16 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+        binary_result = False
+        if self._higher_is_better:
+            binary_result = score >= self._threshold
+        else:
+            binary_result = score <= self._threshold
 
         return {
             "bleu_score": score,
+            "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_threshold": self._threshold,
         }
 
     @overload  # type: ignore
 
@@ -21,6 +21,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the coherence evaluator. Default is 3.
+    :type threshold: int
 
     .. admonition:: Example:
 
@@ -30,6 +32,15 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a CoherenceEvaluator with a query and response.
+    
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_coherence_evaluator]
+            :end-before: [END threshold_coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
@@ -45,10 +56,18 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better
+        )
 
     @overload
     def __call__(
 
@@ -11,7 +11,7 @@
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import _AggregationType
+from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -80,6 +80,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
         overrides the standard aggregator implied by conversation_aggregation_type. None by default.
     :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -89,16 +93,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     def __init__(
         self,
         *,
+        threshold: float = 3.0,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
+        _higher_is_better: Optional[bool] = True,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
         self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        self._higher_is_better = _higher_is_better
+        self._threshold = threshold
         if conversation_aggregator_override is not None:
             # Type ignore since we already checked for None, but mypy doesn't know that.
             self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
@@ -393,7 +401,29 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-            per_turn_results.append(await self._do_eval(eval_input))
+            result = await self._do_eval(eval_input)
+            # logic to determine threshold pass/fail
+            try:
+                for key in list(result.keys()):
+                    if key.endswith("_score") and "rouge" not in key:
+                        score_value = result[key]
+                        base_key = key[:-6]  # Remove "_score" suffix
+                        result_key = f"{base_key}_result"
+                        threshold_key = f"{base_key}_threshold"
+                        result[threshold_key] = self._threshold
+                        if self._higher_is_better:
+                            if int(score_value) >= self._threshold:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                        else:
+                            if int(score_value) <= self._threshold:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+            except Exception as e:
+                print(f"Error calculating binary result: {e}")
+            per_turn_results.append(result)
         # Return results as-is if only one result was produced.
 
         if len(per_turn_results) == 1:
 
@@ -27,7 +27,9 @@ class MultiEvaluatorBase(EvaluatorBase[T]):
     """
 
     def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
-        super().__init__()
+        self._threshold = kwargs.pop("threshold", 3)
+        self._higher_is_better = kwargs.pop("_higher_is_better", False)
+        super().__init__(threshold=self._threshold, _higher_is_better=self._higher_is_better)
         self._parallel = kwargs.pop("_parallel", True)
         self._evaluators = evaluators
 
 
@@ -10,6 +10,7 @@
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase
@@ -43,10 +44,12 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False, threshold: int = 3, _higher_is_better: bool = False):
         self._result_key = result_key
         self._prompty_file = prompty_file
-        super().__init__(eval_last_turn=eval_last_turn)
+        self._threshold = threshold
+        self._higher_is_better = _higher_is_better
+        super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
 
         subclass_name = self.__class__.__name__
         user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
@@ -60,6 +63,26 @@ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, ev
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
+    def _get_binary_result(self, score: float) -> str:
+        """Get the binary result based on the score.
+
+        :param score: The score to evaluate.
+        :type score: float
+        :return: The binary result.
+        :rtype: str
+        """
+        if math.isnan(score):
+            return "unknown"
+        if self._higher_is_better:
+            if score >= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]
+        else:
+            if score <= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
@@ -87,13 +110,29 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
+                binary_result = self._get_binary_result(score)
                 return {
                     self._result_key: float(score),
                     f"gpt_{self._result_key}": float(score),
                     f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_result": binary_result,
+                    f"{self._result_key}_threshold": self._threshold,
                 }
             match = re.search(r"\d", llm_output)
             if match:
                 score = float(match.group())
-            return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
-        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+                binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score), 
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+            }
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score), 
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, TypeVar, Union
+from typing import Dict, TypeVar, Union, Optional
 
 from typing_extensions import override
 
@@ -40,6 +40,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
     """
 
     @override
@@ -50,11 +54,14 @@ def __init__(
         credential: TokenCredential,
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        threshold: int = 3,
+        _higher_is_better: Optional[bool] = False,
     ):
-        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
+        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type, threshold=threshold, _higher_is_better=_higher_is_better)
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
+        self._threshold = threshold
 
     @override
     def __call__(  # pylint: disable=docstring-missing-param
 
@@ -25,6 +25,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param threshold: The threshold for the content safety evaluator. Default is 3.
+    :type threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -37,17 +39,27 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a ContentSafetyEvaluator.
+    
+    # todo: should threshold be a dict like QAEvaluator?
+    .. admonition:: Example with Threshold:
+    
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_content_safety_evaluator]
+            :end-before: [END threshold_content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a ContentSafetyEvaluator.
     """
 
     id = "content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, credential, azure_ai_project, **kwargs):
+    def __init__(self, credential, azure_ai_project, threshold=3, **kwargs):
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project),
-            SexualEvaluator(credential, azure_ai_project),
-            SelfHarmEvaluator(credential, azure_ai_project),
-            HateUnfairnessEvaluator(credential, azure_ai_project),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=threshold),
+            SexualEvaluator(credential, azure_ai_project, threshold=threshold),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=threshold),
+            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)