Commit c4b1d7c
Converge multi-modal evals (Part 1). [Read PR description for details on Part 2] (Azure#38778)
* initial-commit
* Adding multi-modal in conv based Evals
* lint fix
* asset
* Test fix
* asset
* fix test
* Fix
* adding new tests
* fix test
* disable test
* Msg fix
* Msg fix
* Adding localtest back
1 parent e3e4f7d commit c4b1d7c

File tree

9 files changed: +301 −60 lines changed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_e708c75299"
+  "Tag": "python/evaluation/azure-ai-evaluation_08351329d3"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 71 additions & 1 deletion
@@ -12,6 +12,7 @@
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -202,6 +203,59 @@ def converter(conversation: Dict) -> List[DerivedEvalInput]:

         return converter

+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
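For orientation, here is a standalone sketch (not SDK code) of the contract the new converter implements; the URL and message text are illustrative placeholders:

    # A multi-modal conversation: one system message, one user/assistant pair.
    conversation = {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},  # placeholder URL
                ],
            },
            {"role": "assistant", "content": [{"type": "text", "text": "A boardwalk in a park."}]},
        ]
    }

    # The derived converter pairs each user message with its assistant reply,
    # prepends the single system message when one is present, and yields:
    #   [{"conversation": Conversation(messages=[system, user, assistant])}]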
@@ -210,7 +264,7 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         values.

         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.

         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +295,8 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
             )
         # Handle Conversation
         if conversation is not None:
+            if self.is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +311,20 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
                 target=ErrorTarget.CONVERSATION,
             )

+    def is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
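The detection rule above treats a conversation as multi-modal when any message's content is a list containing an image_url item with a "url" key. A standalone restatement of that rule (illustrative names, not SDK code):

    def has_image_content(conversation: dict) -> bool:
        # Mirrors is_multi_modal_conversation: look for an image_url item with a "url".
        messages = conversation.get("messages")
        if not isinstance(messages, list):
            return False
        return any(
            isinstance(m.get("content"), list)
            and any(
                item.get("type") == "image_url" and "url" in item.get("image_url", {})
                for item in m["content"]
            )
            for m in messages
        )

    assert has_image_content(
        {"messages": [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}}]}]}
    )
    assert not has_image_content({"messages": [{"role": "user", "content": "plain text"}]})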

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 9 additions & 0 deletions
@@ -10,6 +10,7 @@
 from typing_extensions import override

 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase

@@ -71,6 +72,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan
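The guard's effect, under the assumption that a multi-modal conversation reaches _do_eval as a bare "conversation" key with neither "query" nor "response": prompty-based quality evaluators now fail fast with a user error instead of forwarding image content to the prompty flow. A minimal restatement (outside the class, for illustration only):

    def is_supported(eval_input: dict) -> bool:
        # Multi-modal conversations arrive as {"conversation": ...} only,
        # so they fail this check and raise EvaluationException in _do_eval.
        return "query" in eval_input or "response" in eval_input

    assert is_supported({"query": "hi", "response": "hello"})
    assert not is_supported({"conversation": {"messages": []}})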

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

Lines changed: 32 additions & 1 deletion
@@ -11,9 +11,10 @@
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential

 from . import EvaluatorBase
@@ -81,6 +82,36 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+
+        :param conversation: The conversation to evaluate, containing a list of messages.
+            Each message should have "role" and "content" keys.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computed from the content safety metric (self._eval_metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
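Net effect of the dispatch: query/response pairs keep flowing to evaluate_with_rai_service, while everything else is validated and routed to evaluate_with_rai_service_multimodal. A sketch of the routing decision only (not SDK code):

    def route(eval_input: dict) -> str:
        # Mirrors the new _do_eval branching in the RAI service base evaluator.
        if "query" in eval_input and "response" in eval_input:
            return "text path: evaluate_with_rai_service"
        return "multimodal path: validate_conversation + evaluate_with_rai_service_multimodal"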

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 @experimental
 class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py

Lines changed: 95 additions & 21 deletions
@@ -543,8 +543,17 @@ def test_groundedness_pro_evaluator(self, project_scope, azure_cred, simple_conv
             convo_result["evaluation_per_turn"]["groundedness_pro_reason"]
         ), "groundedness_pro_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only(
+        self, project_scope, azure_cred, evaluator_class
+    ):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -591,10 +600,17 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
     def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only_with_text_content(
-        self, project_scope, azure_cred
+        self, project_scope, azure_cred, evaluator_class
     ):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -603,10 +619,10 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
                     {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."}
                 ],
             },
-            {"role": "user", "content": "What is in this picture?"},
             {
                 "role": "user",
                 "content": [
+                    {"type": "text", "text": "What is in this picture?"},
                     {
                         "type": "image_url",
                         "image_url": {
@@ -642,8 +658,17 @@ def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_on
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(
+        self, project_scope, azure_cred, evaluator_class
+    ):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -683,15 +708,22 @@ def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(s
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, azure_cred):
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, azure_cred, evaluator_class):
         parent = pathlib.Path(__file__).parent.resolve()
         path = os.path.join(parent, "data")
         image_path = os.path.join(path, "image1.jpg")

         with pathlib.Path(image_path).open("rb") as image_file:
             encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -734,8 +766,15 @@ def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope,
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_content_safety_inference(self, project_scope, azure_cred):
-        evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ContentSafetyMultimodalEvaluator),
+            (ContentSafetyEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_content_safety_inference(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = Conversation(
             messages=[
                 SystemMessage(
@@ -776,8 +815,15 @@ def test_multimodal_evaluator_content_safety_inference(self, project_scope, azur
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred):
-        evaluator = ViolenceMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ViolenceMultimodalEvaluator),
+            (ViolenceEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -816,8 +862,15 @@ def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred):
         assert score["violence_score"] < 1.0
         assert score["violence_reason"], "violence_reason must not be None or empty."

-    def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred):
-        evaluator = SexualMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (SexualMultimodalEvaluator),
+            (SexualEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -856,8 +909,15 @@ def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred):
         assert score["sexual_score"] < 1.0
         assert score["sexual_reason"], "sexual_reason must not be None or empty."

-    def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cred):
-        evaluator = HateUnfairnessMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (HateUnfairnessMultimodalEvaluator),
+            (HateUnfairnessEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -896,8 +956,15 @@ def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cr
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

-    def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred):
-        evaluator = SelfHarmMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (SelfHarmMultimodalEvaluator),
+            (SelfHarmEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
@@ -936,8 +1003,15 @@ def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred):
         assert score["self_harm_score"] < 1.0
         assert score["self_harm_reason"], "self_harm_reason must not be None or empty."

-    def test_multimodal_evaluator_protected_material_json(self, project_scope, azure_cred):
-        evaluator = ProtectedMaterialMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+    @pytest.mark.parametrize(
+        "evaluator_class",
+        [
+            (ProtectedMaterialMultimodalEvaluator),
+            (ProtectedMaterialEvaluator),
+        ],
+    )
+    def test_multimodal_evaluator_protected_material_json(self, project_scope, azure_cred, evaluator_class):
+        evaluator = evaluator_class(credential=azure_cred, azure_ai_project=project_scope)
         conversation = {
             "messages": [
                 {
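Every test above is parametrized to run the same conversation through both the legacy *MultimodalEvaluator and its converged counterpart, which is the point of Part 1: the converged evaluators now accept multi-modal conversations directly. A condensed usage sketch (assuming a configured Azure AI project and credential; the project values and image URL are placeholders):

    from azure.ai.evaluation import ContentSafetyEvaluator
    from azure.identity import DefaultAzureCredential

    # Placeholder project configuration; substitute your own values.
    azure_ai_project = {"subscription_id": "<sub>", "resource_group_name": "<rg>", "project_name": "<proj>"}

    evaluator = ContentSafetyEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)
    score = evaluator(
        conversation={
            "messages": [
                {"role": "user", "content": [
                    {"type": "text", "text": "What is in this picture?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},  # placeholder URL
                ]},
                {"role": "assistant", "content": [{"type": "text", "text": "A nature boardwalk."}]},
            ]
        }
    )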
