
Commit 7f904a3

Content safety evals aggregate max from conversations (Azure#39083)
* add convo agg type, and have harm evals use max
* analysis
* correct enum name in docs
* refactor checked enum into function field
* cl and analysis
* change enum name and update CL
* change function names to private, allow agg type retrieval
* PR comments
* test serialization
* CL
* CI adjustment
* try again
* perf
* skip perf
* remove skip
1 parent d1ce446 commit 7f904a3

File tree: 14 files changed, +309 −6 lines

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -11,10 +11,15 @@
 - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
 - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 - Fixed the non adversarial simulator to run in task-free mode
+- Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+  main score when aggregating per-turn evaluations from a conversation into an overall
+  evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
 
 ### Other Changes
 - Changed minimum required python version to use this package from 3.8 to 3.9
 - Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+- Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+  environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
 
 ## 1.1.0 (2024-12-12)
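
To see why the first entry above matters, here is a tiny arithmetic sketch with invented per-turn severity scores: under the old mean aggregation a single severe turn is diluted by benign ones, while the new max aggregation surfaces the worst turn as the headline score.

turn_scores = [0.0, 7.0, 1.0]  # hypothetical per-turn severity scores for one conversation
print(sum(turn_scores) / len(turn_scores))  # ~2.67: the old mean-based headline score
print(max(turn_scores))                     # 7.0: the new headline score, i.e. the worst turn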

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -42,6 +42,7 @@
     Message,
     OpenAIModelConfiguration,
 )
+from ._constants import AggregationType
 
 __all__ = [
     "evaluate",
@@ -79,4 +80,5 @@
     "SexualMultimodalEvaluator",
     "ViolenceMultimodalEvaluator",
     "ProtectedMaterialMultimodalEvaluator",
+    "AggregationType",
 ]
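
With the re-export above, the enum becomes importable from the package root. A quick sanity check, assuming a build that includes this commit:

from azure.ai.evaluation import AggregationType

# Members map to simple string values, in definition order.
assert AggregationType.MAX.value == "max"
assert [a.name for a in AggregationType] == ["MEAN", "MAX", "MIN", "SUM", "CUSTOM"]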

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 18 additions & 0 deletions
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"
 
 
+@experimental
+class AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 60 additions & 3 deletions
@@ -4,15 +4,18 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -70,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation.AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN ~~~
@@ -81,11 +91,17 @@ def __init__(
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: AggregationType = AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -359,7 +375,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -387,10 +403,51 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN ~~~
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a multi-turn conversation and therefore produces multiple results per
+        evaluation that need to be coalesced into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is passed to an evaluator that evaluates each turn individually). The individual results
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation.AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use of passing functions

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 7 additions & 1 deletion
@@ -15,6 +15,7 @@
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import AggregationType
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
@@ -35,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation.AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
     """
 
     @override
@@ -44,8 +49,9 @@ def __init__(
         azure_ai_project: dict,
         credential: TokenCredential,
         eval_last_turn: bool = False,
+        conversation_aggregation_type: AggregationType = AggregationType.MEAN,
     ):
-        super().__init__(eval_last_turn=eval_last_turn)
+        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import AggregationType
+
+
+def GetAggregator(aggregation_type: AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == AggregationType.SUM:
+        return sum
+    if aggregation_type == AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == AggregationType.MAX:
+        return max
+    if aggregation_type == AggregationType.MIN:
+        return min
+    if aggregation_type == AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return AggregationType.MIN
+    return AggregationType.CUSTOM
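
A round-trip sketch of the two helpers above. The module path is inferred from the `from ._conversation_aggregators import ...` line in _base_eval.py, and these are internal (underscored) APIs, so treat the import as an assumption:

from azure.ai.evaluation._constants import AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

agg = GetAggregator(AggregationType.MAX)  # returns the builtin max
print(agg([2.0, 7.0, 3.0]))               # 7.0
print(GetAggregatorType(agg))             # AggregationType.MAX

# Any unrecognized callable is reported as CUSTOM; GetAggregator(CUSTOM) raises instead.
print(GetAggregatorType(lambda scores: scores[-1]))  # AggregationType.CUSTOM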

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -71,6 +72,7 @@ def __init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -65,6 +66,7 @@ def __init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ def __init__(
             eval_metric=EvaluationMetrics.SEXUAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
        )
 
     @overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ def __init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=AggregationType.MAX,
         )
 
     @overload
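
Taken together, each of the four content safety evaluators now reports its worst turn as the conversation-level score. A sketch of the observable behavior; the azure_ai_project values are placeholders, the call hits the live RAI service, and the result keys are taken from the SDK docs rather than this diff:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<project-name>",            # placeholder
}

evaluator = ViolenceEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
conversation = {
    "messages": [
        {"role": "user", "content": "Tell me a story."},
        {"role": "assistant", "content": "Once upon a time..."},
    ]
}
result = evaluator(conversation=conversation)
# result["violence_score"] is now max() over the per-turn scores;
# result["evaluation_per_turn"] still lists each turn's individual result.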
