
Commit e6dd52c

MilesHolland and needuv authored

cs eval accepts convos (Azure#37966)

* cs eval accepts convos
* keep tests disabled
* private parallel
* incorporate api changes properly
* run black
* appease the mypy gods
* Fix conversation typehint

Co-authored-by: Neehar Duvvuri <[email protected]>

* return type

---------

Co-authored-by: Neehar Duvvuri <[email protected]>

1 parent d565fc4 commit e6dd52c

File tree: 5 files changed, +100 -26 lines

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_051cb9dfbd"
+  "Tag": "python/evaluation/azure-ai-evaluation_73f2254a1c"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/_experimental.py

Lines changed: 2 additions & 4 deletions
@@ -27,13 +27,11 @@


 @overload
-def experimental(wrapped: Type[T]) -> Type[T]:
-    ...
+def experimental(wrapped: Type[T]) -> Type[T]: ...


 @overload
-def experimental(wrapped: Callable[P, T]) -> Callable[P, T]:
-    ...
+def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...


 def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 53 additions & 19 deletions
@@ -2,10 +2,12 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Callable, Dict, List, Union, Optional
+from typing_extensions import override

 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._common._experimental import experimental

 from ._hate_unfairness import HateUnfairnessEvaluator
@@ -15,7 +17,7 @@


 @experimental
-class ContentSafetyEvaluator:
+class ContentSafetyEvaluator(EvaluatorBase):
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.

@@ -24,8 +26,12 @@ class ContentSafetyEvaluator:
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
-        Default is True.
+    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
+    :type eval_last_turn: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: A function that evaluates content-safety metrics for "question-answering" scenario.
+    :rtype: Callable

     **Usage**

@@ -62,41 +68,69 @@ class ContentSafetyEvaluator:
     }
     """

-    def __init__(self, credential, azure_ai_project, parallel: bool = True):
-        self._parallel = parallel
+    def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._parallel = kwargs.pop("parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
             SelfHarmEvaluator(credential, azure_ai_project),
             HateUnfairnessEvaluator(credential, azure_ai_project),
         ]

-    def __call__(self, *, query: str, response: str, **kwargs):
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
+        This inputs must supply either a query AND response, or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
-        Evaluates content-safety metrics for "question-answering" scenario.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword parallel: Whether to evaluate in parallel.
-        :paramtype parallel: bool
-        :return: The scores for content-safety.
-        :rtype: Dict[str, Union[str, float]]
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
         """
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        conversation = eval_input.get("conversation", None)
         results: Dict[str, Union[str, float]] = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(evaluator, query=query, response=response, **kwargs): evaluator
+                    executor.submit(query=query, response=response, conversation=conversation): evaluator
                     for evaluator in self._evaluators
                 }

                 for future in as_completed(futures):
                     results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, **kwargs)
+                result = evaluator(query=query, response=response, conversation=conversation)
                 results.update(result)

         return results
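
With this change the evaluator accepts either a query/response pair or a whole conversation. A minimal usage sketch based on the docstring above; the project values are placeholders, and the conversation shape (a "messages" list of role/content turns) follows the docstring rather than a confirmed schema:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder project scope; real values come from your Azure AI project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
safety_eval = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)

# Single query/response pair: returns a flat dict of per-metric labels and scores.
pair_result = safety_eval(query="What is the capital of France?", response="Paris.")

# Whole conversation: returns aggregate scores plus an "evaluation_per_turn" breakdown
# (see the e2e test in the next file).
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
conv_result = safety_eval(conversation=conversation)
```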

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py

Lines changed: 40 additions & 0 deletions
@@ -357,6 +357,46 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         assert score["hate_unfairness_score"] < 1.0
         assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

+    def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
+        safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+        score = safety_eval(
+            conversation=simple_conversation,
+        )
+
+        assert score is not None
+
+        assert score["violence_score"] < 1.0
+        assert score["sexual_score"] < 1.0
+        assert score["self_harm_score"] < 1.0
+        assert score["hate_unfairness_score"] < 1.0
+
+        per_turn = score["evaluation_per_turn"]
+
+        assert per_turn["violence"][0] == "Very low"
+        assert per_turn["violence"][1] == "Very low"
+        assert per_turn["violence_score"][0] < 1.0
+        assert per_turn["violence_score"][1] < 1.0
+        assert per_turn["violence_reason"][0], "violence_reason must not be None or empty."
+        assert per_turn["violence_reason"][1], "violence_reason must not be None or empty."
+        assert per_turn["sexual"][0] == "Very low"
+        assert per_turn["sexual"][1] == "Very low"
+        assert per_turn["sexual_score"][0] < 1.0
+        assert per_turn["sexual_score"][1] < 1.0
+        assert per_turn["sexual_reason"][0], "sexual_reason must not be None or empty."
+        assert per_turn["sexual_reason"][1], "sexual_reason must not be None or empty."
+        assert per_turn["self_harm"][0] == "Very low"
+        assert per_turn["self_harm"][1] == "Very low"
+        assert per_turn["self_harm_score"][0] < 1.0
+        assert per_turn["self_harm_score"][1] < 1.0
+        assert per_turn["self_harm_reason"][0], "self_harm_reason must not be None or empty."
+        assert per_turn["self_harm_reason"][1], "self_harm_reason must not be None or empty."
+        assert per_turn["hate_unfairness"][0] == "Very low"
+        assert per_turn["hate_unfairness"][1] == "Very low"
+        assert per_turn["hate_unfairness_score"][0] < 1.0
+        assert per_turn["hate_unfairness_score"][1] < 1.0
+        assert per_turn["hate_unfairness_reason"][0], "hate_unfairness_reason must not be None or empty."
+        assert per_turn["hate_unfairness_reason"][1], "hate_unfairness_reason must not be None or empty."
+
     def test_protected_material_evaluator(self, project_scope, azure_cred, simple_conversation):
         ip_eval = ProtectedMaterialEvaluator(azure_cred, project_scope)
         good_result = ip_eval(
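
The `simple_conversation` fixture itself is not part of this diff. A hypothetical stand-in that would satisfy the per-turn assertions above (they index two turns of benign content):

```python
# Hypothetical fixture contents -- the real fixture lives in the test suite's conftest.
simple_conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris is the capital of France."},
        {"role": "user", "content": "And what is the capital of Japan?"},
        {"role": "assistant", "content": "Tokyo is the capital of Japan."},
    ]
}
```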

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 4 additions & 2 deletions
@@ -167,14 +167,16 @@ def test_evaluate_with_relative_data_path(self, model_config):

     @pytest.mark.azuretest
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
-    def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
+    def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
         input_data = pd.read_json(data_file, lines=True)

         # CS evaluator tries to store the credential, which breaks multiprocessing at
         # pickling stage. So we pass None for credential and let child evals
         # generate a default credential at runtime.
         # Internal Parallelism is also disabled to avoid faulty recordings.
-        content_safety_eval = ContentSafetyEvaluator(project_scope, credential=None, parallel=False)
+        content_safety_eval = ContentSafetyEvaluator(
+            azure_ai_project=project_scope, credential=azure_cred, parallel=False
+        )

         # run the evaluation
         result = evaluate(
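
For reference, a minimal sketch of how an evaluator configured this way is typically passed to `evaluate` (the `evaluate(` call above is truncated in this view); the data file name and evaluator key are placeholders, not values from this commit:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator, evaluate

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
content_safety_eval = ContentSafetyEvaluator(
    azure_ai_project=azure_ai_project, credential=DefaultAzureCredential(), parallel=False
)

# Hypothetical invocation: each line of the JSONL file supplies a "query" and "response" column.
result = evaluate(
    data="questions_answers.jsonl",
    evaluators={"content_safety": content_safety_eval},
)
```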
