
Commit 7f904a3

Content safety evals aggregate max from conversations (Azure#39083)
* add convo agg type, and have harm evals use max
* analysis
* correct enum name in docs
* refactor checked enum into function field
* cl and analysis
* change enum name and update CL
* change function names to private, allow agg type retrieval
* PR comments
* test serialization
* CL
* CI adjustment
* try again
* perf
* skip perf
* remove skip
1 parent d1ce446 commit 7f904a3

File tree: 14 files changed, +309 −6 lines

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -11,10 +11,15 @@
 - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
 - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 - Fixed the non adversarial simulator to run in task-free mode
+- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+  main score when aggregating per-turn evaluations from a conversation into an overall
+  evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
 
 ### Other Changes
 - Changed minimum required python version to use this package from 3.8 to 3.9
 - Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+  environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
 
 ## 1.1.0 (2024-12-12)
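
To see why the first entry above matters, here is a tiny arithmetic sketch with invented per-turn severity scores: under the old mean aggregation a single severe turn is diluted by benign ones, while the new max aggregation surfaces the worst turn as the headline score.

turn_scores = [0.0, 7.0, 1.0]  # hypothetical per-turn severity scores for one conversation
print(sum(turn_scores) / len(turn_scores))  # ~2.67: the old mean-based headline score
print(max(turn_scores))                     # 7.0: the new headline score, i.e. the worst turn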

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -42,6 +42,7 @@
     Message,
     OpenAIModelConfiguration,
 )
+from ._constants import AggregationType
 
 __all__ = [
     "evaluate",
@@ -79,4 +80,5 @@
     "SexualMultimodalEvaluator",
     "ViolenceMultimodalEvaluator",
     "ProtectedMaterialMultimodalEvaluator",
+    "AggregationType",
 ]
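
With the re-export above, the enum becomes importable from the package root. A quick sanity check, assuming a build that includes this commit:

from azure.ai.evaluation import AggregationType

# Members map to simple string values, in definition order.
assert AggregationType.MAX.value == "max"
assert [a.name for a in AggregationType] == ["MEAN", "MAX", "MIN", "SUM", "CUSTOM"]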

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 18 additions & 0 deletions
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"
 
 
+@experimental
+class AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 60 additions & 3 deletions
@@ -4,15 +4,18 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -70,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation.AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN ~~~
@@ -81,11 +91,17 @@ def __init__(
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: AggregationType = AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -359,7 +375,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -387,10 +403,51 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN ~~~
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a multi-turn conversation and therefore produces multiple results per
+        evaluation that need to be coalesced into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is passed to an evaluator that evaluates each turn individually). The individual results
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation.AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use of passing functions

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 7 additions & 1 deletion
@@ -15,6 +15,7 @@
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import AggregationType
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
@@ -35,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation.AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
     """
 
     @override
@@ -44,8 +49,9 @@ def __init__(
         azure_ai_project: dict,
         credential: TokenCredential,
         eval_last_turn: bool = False,
+        conversation_aggregation_type: AggregationType = AggregationType.MEAN,
     ):
-        super().__init__(eval_last_turn=eval_last_turn)
+        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import AggregationType
+
+
+def GetAggregator(aggregation_type: AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == AggregationType.SUM:
+        return sum
+    if aggregation_type == AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == AggregationType.MAX:
+        return max
+    if aggregation_type == AggregationType.MIN:
+        return min
+    if aggregation_type == AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return AggregationType.MIN
+    return AggregationType.CUSTOM
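
A round-trip sketch of the two helpers above. The module path is inferred from the `from ._conversation_aggregators import ...` line in _base_eval.py, and these are internal (underscored) APIs, so treat the import as an assumption:

from azure.ai.evaluation._constants import AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

agg = GetAggregator(AggregationType.MAX)  # returns the builtin max
print(agg([2.0, 7.0, 3.0]))               # 7.0
print(GetAggregatorType(agg))             # AggregationType.MAX

# Any unrecognized callable is reported as CUSTOM; GetAggregator(CUSTOM) raises instead.
print(GetAggregatorType(lambda scores: scores[-1]))  # AggregationType.CUSTOM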

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -71,6 +72,7 @@ def __init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -65,6 +66,7 @@ def __init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ def __init__(
             eval_metric=EvaluationMetrics.SEXUAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
        )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ def __init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload
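
Taken together, each of the four content safety evaluators now reports its worst turn as the conversation-level score. A sketch of the observable behavior; the azure_ai_project values are placeholders, the call hits the live RAI service, and the result keys are taken from the SDK docs rather than this diff:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<project-name>",            # placeholder
}

evaluator = ViolenceEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
conversation = {
    "messages": [
        {"role": "user", "content": "Tell me a story."},
        {"role": "assistant", "content": "Once upon a time..."},
    ]
}
result = evaluator(conversation=conversation)
# result["violence_score"] is now max() over the per-turn scores;
# result["evaluation_per_turn"] still lists each turn's individual result.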
