Commit c4b1d7c
Converge multi-modal evals (Part 1). [Read PR description for details on Part 2] (Azure#38778)
* initial-commit
* Adding multi-modal in conv based Evals
* lint fix
* asset
* Test fix
* asset
* fix test
* Fix
* adding new tests
* fix test
* disable test
* Msg fix
* Msg fix
* Adding localtest back
1 parent e3e4f7d commit c4b1d7c

File tree

9 files changed: +301 −60 lines changed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_e708c75299"
+  "Tag": "python/evaluation/azure-ai-evaluation_08351329d3"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 71 additions & 1 deletion
@@ -12,6 +12,7 @@
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -202,6 +203,59 @@ def converter(conversation: Dict) -> List[DerivedEvalInput]:

         return converter

+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
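For orientation, here is a standalone sketch (not SDK code) of the contract the new converter implements; the URL and message text are illustrative placeholders:

    # A multi-modal conversation: one system message, one user/assistant pair.
    conversation = {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},  # placeholder URL
                ],
            },
            {"role": "assistant", "content": [{"type": "text", "text": "A boardwalk in a park."}]},
        ]
    }

    # The derived converter pairs each user message with its assistant reply,
    # prepends the single system message when one is present, and yields:
    #   [{"conversation": Conversation(messages=[system, user, assistant])}]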
@@ -210,7 +264,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         values.

         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.

         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +295,8 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
             )
         # Handle Conversation
         if conversation is not None:
+            if self.is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +311,20 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
                 target=ErrorTarget.CONVERSATION,
             )

+    def is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
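The detection rule above treats a conversation as multi-modal when any message's content is a list containing an image_url item with a "url" key. A standalone restatement of that rule (illustrative names, not SDK code):

    def has_image_content(conversation: dict) -> bool:
        # Mirrors is_multi_modal_conversation: look for an image_url item with a "url".
        messages = conversation.get("messages")
        if not isinstance(messages, list):
            return False
        return any(
            isinstance(m.get("content"), list)
            and any(
                item.get("type") == "image_url" and "url" in item.get("image_url", {})
                for item in m["content"]
            )
            for m in messages
        )

    assert has_image_content(
        {"messages": [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}}]}]}
    )
    assert not has_image_content({"messages": [{"role": "user", "content": "plain text"}]})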

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 9 additions & 0 deletions
@@ -10,6 +10,7 @@
 from typing_extensions import override

 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase

@@ -71,6 +72,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan
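The guard's effect, under the assumption that a multi-modal conversation reaches _do_eval as a bare "conversation" key with neither "query" nor "response": prompty-based quality evaluators now fail fast with a user error instead of forwarding image content to the prompty flow. A minimal restatement (outside the class, for illustration only):

    def is_supported(eval_input: dict) -> bool:
        # Multi-modal conversations arrive as {"conversation": ...} only,
        # so they fail this check and raise EvaluationException in _do_eval.
        return "query" in eval_input or "response" in eval_input

    assert is_supported({"query": "hi", "response": "hello"})
    assert not is_supported({"conversation": {"messages": []}})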

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 32 additions & 1 deletion
@@ -11,9 +11,10 @@
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential

 from . import EvaluatorBase
@@ -81,6 +82,36 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+
+        :param conversation: The conversation to evaluate, containing a list of messages.
+            Each message should have "role" and "content" keys.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computed from the content safety metric (self._eval_metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
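Net effect of the dispatch: query/response pairs keep flowing to evaluate_with_rai_service, while everything else is validated and routed to evaluate_with_rai_service_multimodal. A sketch of the routing decision only (not SDK code):

    def route(eval_input: dict) -> str:
        # Mirrors the new _do_eval branching in the RAI service base evaluator.
        if "query" in eval_input and "response" in eval_input:
            return "text path: evaluate_with_rai_service"
        return "multimodal path: validate_conversation + evaluate_with_rai_service_multimodal"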

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 @experimental
 class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py

Lines changed: 95 additions & 21 deletions
@@ -543,8 +543,17 @@ def test_groundedness_pro_evaluator(self, project_scope, azure_cred, simple_conv
             convo_result["evaluation_per_turn"]["groundedness_pro_reason"]
         ), "groundedness_pro_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only(
+        self, project_scope, azure_cred, evaluator_class
+    ):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -591,10 +600,17 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
     def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only_with_text_content(
-        self, project_scope, azure_cred
+        self, project_scope, azure_cred, evaluator_class
     ):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -603,10 +619,10 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
                     {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."}
                 ],
             },
-            {"role": "user", "content": "What is in this picture?"},
             {
                 "role": "user",
                 "content": [
+                    {"type": "text", "text": "What is in this picture?"},
                     {
                         "type": "image_url",
                         "image_url": {
@@ -642,8 +658,17 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(
+        self, project_scope, azure_cred, evaluator_class
+    ):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -683,15 +708,22 @@ def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(s
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, azure_cred):
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, azure_cred, evaluator_class):
         parent = pathlib.Path(__file__).parent.resolve()
         path = os.path.join(parent, "data")
         image_path = os.path.join(path, "image1.jpg")

         with pathlib.Path(image_path).open("rb") as image_file:
             encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -734,8 +766,15 @@ def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope,
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_inference(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_inference(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = Conversation(
             messages=[
                 SystemMessage(
@@ -776,8 +815,15 @@ def test_multimodal_evaluator_content_safety_inference(self, project_scope, azur
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred):
-        evaluator = ViolenceMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ViolenceMultimodalEvaluator),
+            (ViolenceEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -816,8 +862,15 @@ def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred):
         assert score["violence_score"] < 1.0
         assert score["violence_reason"], "violence_reason must not be None or empty."

-    def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred):
-        evaluator = SexualMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (SexualMultimodalEvaluator),
+            (SexualEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -856,8 +909,15 @@ def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred):
         assert score["sexual_score"] < 1.0
         assert score["sexual_reason"], "sexual_reason must not be None or empty."

-    def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cred):
-        evaluator = HateUnfairnessMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (HateUnfairnessMultimodalEvaluator),
+            (HateUnfairnessEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -896,8 +956,15 @@ def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cr
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred):
-        evaluator = SelfHarmMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (SelfHarmMultimodalEvaluator),
+            (SelfHarmEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -936,8 +1003,15 @@ def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred):
         assert score["self_harm_score"] < 1.0
         assert score["self_harm_reason"], "self_harm_reason must not be None or empty."

-    def test_multimodal_evaluator_protected_material_json(self, project_scope, azure_cred):
-        evaluator = ProtectedMaterialMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ProtectedMaterialMultimodalEvaluator),
+            (ProtectedMaterialEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_protected_material_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
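Every test above is parametrized to run the same conversation through both the legacy *MultimodalEvaluator and its converged counterpart, which is the point of Part 1: the converged evaluators now accept multi-modal conversations directly. A condensed usage sketch (assuming a configured Azure AI project and credential; the project values and image URL are placeholders):

    from azure.ai.evaluation import ContentSafetyEvaluator
    from azure.identity import DefaultAzureCredential

    # Placeholder project configuration; substitute your own values.
    azure_ai_project = {"subscription_id": "<sub>", "resource_group_name": "<rg>", "project_name": "<proj>"}

    evaluator = ContentSafetyEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
    score = evaluator(
        conversation={
            "messages": [
                {"role": "user", "content": [
                    {"type": "text", "text": "What is in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},  # placeholder URL
                ]},
                {"role": "assistant", "content": [{"type": "text", "text": "A nature boardwalk."}]},
            ]
        }
    )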
