
Commit e6dd52c

MilesHolland and needuv authored

cs eval accepts convos (Azure#37966)

* cs eval accepts convos
* keep tests disabled
* private parallel
* incorporate api changes properly
* run black
* appease the mypy gods
* Fix conversation typehint

Co-authored-by: Neehar Duvvuri <[email protected]>

* return type

---------

Co-authored-by: Neehar Duvvuri <[email protected]>

1 parent d565fc4 commit e6dd52c

File tree: 5 files changed, +100 -26 lines

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_051cb9dfbd"
+  "Tag": "python/evaluation/azure-ai-evaluation_73f2254a1c"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py

Lines changed: 2 additions & 4 deletions
@@ -27,13 +27,11 @@


 @overload
-def experimental(wrapped: Type[T]) -> Type[T]:
-    ...
+def experimental(wrapped: Type[T]) -> Type[T]: ...


 @overload
-def experimental(wrapped: Callable[P, T]) -> Callable[P, T]:
-    ...
+def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...


 def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 53 additions & 19 deletions
@@ -2,10 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Callable, Dict, List, Union, Optional
+from typing_extensions import override

 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._common._experimental import experimental

 from ._hate_unfairness import HateUnfairnessEvaluator
@@ -15,7 +17,7 @@


 @experimental
-class ContentSafetyEvaluator:
+class ContentSafetyEvaluator(EvaluatorBase):
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.

@@ -24,8 +26,12 @@ class ContentSafetyEvaluator:
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
-        Default is True.
+    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
+    :type eval_last_turn: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: A function that evaluates content-safety metrics for "question-answering" scenario.
+    :rtype: Callable

     **Usage**

@@ -62,41 +68,69 @@ class ContentSafetyEvaluator:
     }
     """

-    def __init__(self, credential, azure_ai_project, parallel: bool = True):
-        self._parallel = parallel
+    def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._parallel = kwargs.pop("parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
             SelfHarmEvaluator(credential, azure_ai_project),
             HateUnfairnessEvaluator(credential, azure_ai_project),
         ]

-    def __call__(self, *, query: str, response: str, **kwargs):
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
+        This inputs must supply either a query AND response, or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
-        Evaluates content-safety metrics for "question-answering" scenario.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword parallel: Whether to evaluate in parallel.
-        :paramtype parallel: bool
-        :return: The scores for content-safety.
-        :rtype: Dict[str, Union[str, float]]
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
         """
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        conversation = eval_input.get("conversation", None)
         results: Dict[str, Union[str, float]] = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(evaluator, query=query, response=response, **kwargs): evaluator
+                    executor.submit(query=query, response=response, conversation=conversation): evaluator
                     for evaluator in self._evaluators
                 }

                 for future in as_completed(futures):
                     results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, **kwargs)
+                result = evaluator(query=query, response=response, conversation=conversation)
                 results.update(result)

         return results
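
With this change the evaluator accepts either a query/response pair or a whole conversation. A minimal usage sketch based on the docstring above; the project values are placeholders, and the conversation shape (a "messages" list of role/content turns) follows the docstring rather than a confirmed schema:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder project scope; real values come from your Azure AI project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
safety_eval = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)

# Single query/response pair: returns a flat dict of per-metric labels and scores.
pair_result = safety_eval(query="What is the capital of France?", response="Paris.")

# Whole conversation: returns aggregate scores plus an "evaluation_per_turn" breakdown
# (see the e2e test in the next file).
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
conv_result = safety_eval(conversation=conversation)
```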

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py

Lines changed: 40 additions & 0 deletions
@@ -357,6 +357,46 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

+    def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
+        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+        score = safety_eval(
+            conversation=simple_conversation,
+        )
+
+        assert score is not None
+
+        assert score["violence_score"] < 1.0
+        assert score["sexual_score"] < 1.0
+        assert score["self_harm_score"] < 1.0
+        assert score["hate_unfairness_score"] < 1.0
+
+        per_turn = score["evaluation_per_turn"]
+
+        assert per_turn["violence"][0] == "Very low"
+        assert per_turn["violence"][1] == "Very low"
+        assert per_turn["violence_score"][0] < 1.0
+        assert per_turn["violence_score"][1] < 1.0
+        assert per_turn["violence_reason"][0], "violence_reason must not be None or empty."
+        assert per_turn["violence_reason"][1], "violence_reason must not be None or empty."
+        assert per_turn["sexual"][0] == "Very low"
+        assert per_turn["sexual"][1] == "Very low"
+        assert per_turn["sexual_score"][0] < 1.0
+        assert per_turn["sexual_score"][1] < 1.0
+        assert per_turn["sexual_reason"][0], "sexual_reason must not be None or empty."
+        assert per_turn["sexual_reason"][1], "sexual_reason must not be None or empty."
+        assert per_turn["self_harm"][0] == "Very low"
+        assert per_turn["self_harm"][1] == "Very low"
+        assert per_turn["self_harm_score"][0] < 1.0
+        assert per_turn["self_harm_score"][1] < 1.0
+        assert per_turn["self_harm_reason"][0], "self_harm_reason must not be None or empty."
+        assert per_turn["self_harm_reason"][1], "self_harm_reason must not be None or empty."
+        assert per_turn["hate_unfairness"][0] == "Very low"
+        assert per_turn["hate_unfairness"][1] == "Very low"
+        assert per_turn["hate_unfairness_score"][0] < 1.0
+        assert per_turn["hate_unfairness_score"][1] < 1.0
+        assert per_turn["hate_unfairness_reason"][0], "hate_unfairness_reason must not be None or empty."
+        assert per_turn["hate_unfairness_reason"][1], "hate_unfairness_reason must not be None or empty."
+
     def test_protected_material_evaluator(self, project_scope, azure_cred, simple_conversation):
         ip_eval = ProtectedMaterialEvaluator(azure_cred, project_scope)
         good_result = ip_eval(
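
The `simple_conversation` fixture itself is not part of this diff. A hypothetical stand-in that would satisfy the per-turn assertions above (they index two turns of benign content):

```python
# Hypothetical fixture contents -- the real fixture lives in the test suite's conftest.
simple_conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris is the capital of France."},
        {"role": "user", "content": "And what is the capital of Japan?"},
        {"role": "assistant", "content": "Tokyo is the capital of Japan."},
    ]
}
```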

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 4 additions & 2 deletions
@@ -167,14 +167,16 @@ def test_evaluate_with_relative_data_path(self, model_config):

     @pytest.mark.azuretest
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
-    def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
+    def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
         input_data = pd.read_json(data_file, lines=True)

         # CS evaluator tries to store the credential, which breaks multiprocessing at
         # pickling stage. So we pass None for credential and let child evals
         # generate a default credential at runtime.
         # Internal Parallelism is also disabled to avoid faulty recordings.
-        content_safety_eval = ContentSafetyEvaluator(project_scope, credential=None, parallel=False)
+        content_safety_eval = ContentSafetyEvaluator(
+            azure_ai_project=project_scope, credential=azure_cred, parallel=False
+        )

         # run the evaluation
         result = evaluate(
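
For reference, a minimal sketch of how an evaluator configured this way is typically passed to `evaluate` (the `evaluate(` call above is truncated in this view); the data file name and evaluator key are placeholders, not values from this commit:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator, evaluate

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
content_safety_eval = ContentSafetyEvaluator(
    azure_ai_project=azure_ai_project, credential=DefaultAzureCredential(), parallel=False
)

# Hypothetical invocation: each line of the JSONL file supplies a "query" and "response" column.
result = evaluate(
    data="questions_answers.jsonl",
    evaluators={"content_safety": content_safety_eval},
)
```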
