Skip to content

Commit c130cf4

Browse files
authored
Fix __call__ Overload Types (Azure#38238)
* Fix __call__ overload issues * fix typing issue * make query required for groundednesspro * fix a malformatted docstring * fix some type hints and remove eval_last_turn from evaluators * fix optional import * comment out eval last turn section
1 parent 67b6cfb commit c130cf4

File tree

16 files changed

+32
-47
lines changed

16 files changed

+32
-47
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,10 @@ def __call__( # pylint: disable=docstring-missing-param
9393
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
9494
turns, the evaluator will aggregate the results of each turn.
9595
96+
:keyword query: The query to be evaluated.
97+
:paramtype query: str
9698
:keyword response: The response to be evaluated.
9799
:paramtype response: Optional[str]
98-
:keyword context: The context to be evaluated.
99-
:paramtype context: Optional[str]
100100
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
101101
key "messages". Conversation turns are expected
102102
to be dictionaries with keys "content" and "role".

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
2727
:param azure_ai_project: The scope of the Azure AI project.
2828
It contains subscription id, resource group, and project name.
2929
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
30-
:param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
31-
:type eval_last_turn: bool
3230
:param kwargs: Additional arguments to pass to the evaluator.
3331
:type kwargs: Any
3432
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -69,8 +67,8 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
6967
"""
7068

7169
# TODO address 3579092 to re-enabled parallel evals.
72-
def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
73-
super().__init__(eval_last_turn=eval_last_turn)
70+
def __init__(self, credential, azure_ai_project, **kwargs):
71+
super().__init__()
7472
self._parallel = kwargs.pop("_parallel", False)
7573
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
7674
ViolenceEvaluator(credential, azure_ai_project),

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ def __init__(
5050
self,
5151
credential,
5252
azure_ai_project,
53-
eval_last_turn: bool = False,
5453
):
5554
super().__init__(
5655
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
5756
azure_ai_project=azure_ai_project,
5857
credential=credential,
59-
eval_last_turn=eval_last_turn,
6058
)
6159

6260
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ def __init__(
5050
self,
5151
credential,
5252
azure_ai_project,
53-
eval_last_turn: bool = False,
5453
):
5554
super().__init__(
5655
eval_metric=EvaluationMetrics.SELF_HARM,
5756
azure_ai_project=azure_ai_project,
5857
credential=credential,
59-
eval_last_turn=eval_last_turn,
6058
)
6159

6260
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ def __init__(
5050
self,
5151
credential,
5252
azure_ai_project,
53-
eval_last_turn: bool = False,
5453
):
5554
super().__init__(
5655
eval_metric=EvaluationMetrics.SEXUAL,
5756
azure_ai_project=azure_ai_project,
5857
credential=credential,
59-
eval_last_turn=eval_last_turn,
6058
)
6159

6260
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ def __init__(
5050
self,
5151
credential,
5252
azure_ai_project,
53-
eval_last_turn: bool = False,
5453
):
5554
super().__init__(
5655
eval_metric=EvaluationMetrics.VIOLENCE,
5756
azure_ai_project=azure_ai_project,
5857
credential=credential,
59-
eval_last_turn=eval_last_turn,
6058
)
6159

6260
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,11 @@ def __init__(
5555
self,
5656
credential,
5757
azure_ai_project,
58-
eval_last_turn: bool = False,
5958
):
6059
super().__init__(
6160
eval_metric=_InternalEvaluationMetrics.ECI,
6261
azure_ai_project=azure_ai_project,
6362
credential=credential,
64-
eval_last_turn=eval_last_turn,
6563
)
6664

6765
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __call__(
5555
*,
5656
response: str,
5757
) -> Dict[str, Union[str, float]]:
58-
"""Evaluate fluency in given query/response
58+
"""Evaluate fluency in given response
5959
6060
:keyword response: The response to be evaluated.
6161
:paramtype response: str

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,11 @@ def __init__(
5656
self,
5757
credential,
5858
azure_ai_project,
59-
eval_last_turn: bool = False,
6059
):
6160
super().__init__(
6261
eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
6362
azure_ai_project=azure_ai_project,
6463
credential=credential,
65-
eval_last_turn=eval_last_turn,
6664
)
6765

6866
@overload

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,14 +180,14 @@ def __call__(
180180
self,
181181
*,
182182
conversation: Conversation,
183-
) -> Dict[str, Union[float, Dict[str, List[float]]]]:
183+
) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
184184
"""Evaluates retrieval for a multi-turn evaluation. If the conversation has more than one turn,
185185
the evaluator will aggregate the results of each turn.
186186
187187
:keyword conversation: The conversation to be evaluated.
188188
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
189189
:return: The scores for Chat scenario.
190-
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
190+
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
191191
"""
192192

193193
def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param
@@ -202,7 +202,7 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param
202202
:keyword conversation: The conversation to be evaluated.
203203
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
204204
:return: The scores for Chat scenario.
205-
:rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
205+
:rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
206206
"""
207207
query = kwargs.pop("query", None)
208208
context = kwargs.pop("context", None)

0 commit comments

Comments
 (0)