Reasoning support for evaluators #42482

Open
wants to merge 79 commits into base: main
Changes from 74 commits
Commits
79 commits
4318329
Prepare evals SDK Release
May 28, 2025
192b980
Fix bug
May 28, 2025
758adb4
Fix for ADV_CONV for FDP projects
May 29, 2025
de09fd1
Update release date
May 29, 2025
ef60fe6
Merge branch 'main' into main
nagkumar91 May 29, 2025
8ca51d0
Merge branch 'Azure:main' into main
nagkumar91 May 30, 2025
98bfc3a
Merge branch 'Azure:main' into main
nagkumar91 Jun 2, 2025
a5f32e8
Merge branch 'Azure:main' into main
nagkumar91 Jun 9, 2025
5fd88b6
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
51f2b44
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
a5be8b5
Merge branch 'Azure:main' into main
nagkumar91 Jun 16, 2025
75965b7
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
d0c5e53
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
b790276
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
d5ca243
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
8d62e36
re-add pyrit to matrix
Jun 26, 2025
59a70f2
Change grader ids
Jun 26, 2025
4d146d7
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
f7a4c83
Update unit test
Jun 27, 2025
79e3a40
replace all old grader IDs in tests
Jun 27, 2025
588cbec
Merge branch 'main' into main
nagkumar91 Jun 30, 2025
7514472
Update platform-matrix.json
nagkumar91 Jun 30, 2025
28b2513
Update test to ensure everything is mocked
Jul 1, 2025
8603e0e
tox/black fixes
Jul 1, 2025
895f226
Skip that test with issues
Jul 1, 2025
b4b2daf
Merge branch 'Azure:main' into main
nagkumar91 Jul 1, 2025
023f07f
update grader ID according to API View feedback
Jul 1, 2025
45b5f5d
Update test
Jul 2, 2025
1ccb4db
remove string check for grader ID
Jul 2, 2025
6fd9aa5
Merge branch 'Azure:main' into main
nagkumar91 Jul 2, 2025
f871855
Update changelog and officialy start freeze
Jul 2, 2025
59ac230
update the enum according to suggestions
Jul 2, 2025
794a2c4
update the changelog
Jul 2, 2025
b33363c
Finalize logic
Jul 2, 2025
464e2dd
Merge branch 'Azure:main' into main
nagkumar91 Jul 3, 2025
4585b14
Merge branch 'Azure:main' into main
nagkumar91 Jul 7, 2025
89c2988
Initial plan
Copilot Jul 7, 2025
6805018
Fix client request ID headers in azure-ai-evaluation
Copilot Jul 7, 2025
aad48df
Fix client request ID header format in rai_service.py
Copilot Jul 7, 2025
db75552
Merge pull request #5 from nagkumar91/copilot/fix-4
nagkumar91 Jul 10, 2025
b8eebf3
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
2899ad4
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
c431563
Merge branch 'Azure:main' into main
nagkumar91 Jul 17, 2025
79ed63c
Merge branch 'Azure:main' into main
nagkumar91 Jul 18, 2025
a3be3fc
Merge branch 'Azure:main' into main
nagkumar91 Jul 21, 2025
056ac4d
Passing threshold in AzureOpenAIScoreModelGrader
Jul 21, 2025
1779059
Add changelog
Jul 21, 2025
43fecff
Adding the self.pass_threshold instead of pass_threshold
Jul 21, 2025
b0c102b
Merge branch 'Azure:main' into main
nagkumar91 Jul 22, 2025
7bf5f1f
Add the python grader
Jul 22, 2025
3248ad0
Remove redundant test
Jul 22, 2025
d76f59b
Add class to exception list and format code
Jul 23, 2025
4d60e43
Merge branch 'main' into feature/python_grader
nagkumar91 Jul 24, 2025
98d1626
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
9248c38
Add properties to evaluation upload run for FDP
Jul 24, 2025
74b760f
Remove debug
Jul 24, 2025
23dbc85
Merge branch 'feature/python_grader'
Jul 24, 2025
467ccb6
Remove the redundant property
Jul 24, 2025
c2beee8
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
be9a19a
Fix changelog
Jul 24, 2025
de3a1e1
Fix the multiple features added section
Jul 24, 2025
f9faa61
removed the properties in update
Jul 24, 2025
69e783a
Merge branch 'Azure:main' into main
nagkumar91 Jul 28, 2025
8ebea2a
Merge branch 'Azure:main' into main
nagkumar91 Jul 31, 2025
3f9c818
Merge branch 'Azure:main' into main
nagkumar91 Aug 1, 2025
3b3159c
Merge branch 'Azure:main' into main
nagkumar91 Aug 5, 2025
d78b834
Merge branch 'Azure:main' into main
nagkumar91 Aug 6, 2025
ae3fc52
Merge branch 'Azure:main' into main
nagkumar91 Aug 8, 2025
19cce75
evaluation: support is_reasoning_model across all prompty-based evalu…
Aug 8, 2025
e59ca7f
evaluation: docs(Preview) + groundedness feature-detection + is_reaso…
Aug 8, 2025
98b4618
evaluation: revert _proxy_completion_model.py to origin/main version
Aug 8, 2025
706c042
Merge branch 'Azure:main' into main
nagkumar91 Aug 11, 2025
c418513
Merge remote-tracking branch 'origin/main' into diff-20250811-171736
Aug 12, 2025
86f24ba
Restore files that shouldn't have been modified
Aug 12, 2025
a1e55b4
Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evalua…
nagkumar91 Aug 12, 2025
bd6809f
Update the groundedness based on comments
Aug 12, 2025
3ae37cb
Add changelog to bug fix and link issue
Aug 12, 2025
6b8d4ce
Fix docstring
Aug 12, 2025
733ee1a
lint fixes
Aug 12, 2025
6 changes: 6 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -7,6 +7,12 @@
### Features Added
- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
- Preview: Added `is_reasoning_model` keyword parameter to all prompty-based evaluators
(`SimilarityEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`,
`RetrievalEvaluator`, `GroundednessEvaluator`, `IntentResolutionEvaluator`,
`ResponseCompletenessEvaluator`, `TaskAdherenceEvaluator`, `ToolCallAccuracyEvaluator`).
When set, evaluator prompty configuration is adjusted appropriately for reasoning models.
`QAEvaluator` now propagates this parameter to its prompty-based child evaluators.
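
As an illustration of the features listed above, a minimal sketch of how the new `tags` and `is_reasoning_model` parameters might be used together; the endpoint, deployment, API key, and dataset path are placeholders, not values from this PR:

from azure.ai.evaluation import CoherenceEvaluator, evaluate

# Placeholder Azure OpenAI model configuration.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# (Preview) adjust the prompty configuration for a reasoning-model deployment.
coherence = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)

# Tags are free-form key-value pairs attached to the evaluation run.
results = evaluate(
    data="<path-to-data>.jsonl",
    evaluators={"coherence": coherence},
    tags={"experiment": "reasoning-models", "variant": "preview"},
)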

### Bugs Fixed

@@ -12,17 +12,22 @@

class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""
Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
Evaluates coherence for a given query and response or a multi-turn
conversation, including reasoning.

The coherence measure assesses the ability of the language model to generate text that reads naturally,
flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
and user-friendliness of a model's generated responses in real-world applications.
The coherence measure assesses the model's ability to generate text that
reads naturally, flows smoothly, and resembles human-like language. Use it
when assessing the readability and user-friendliness of responses.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
:type model_config:
Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]
:param threshold: The threshold for the coherence evaluator. Default is 3.
:type threshold: int
:keyword is_reasoning_model: (Preview) Adjusts prompty config
for reasoning models when True.
:paramtype is_reasoning_model: bool

.. admonition:: Example:

@@ -31,7 +36,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
:caption: Initialize and call CoherenceEvaluator using
azure.ai.evaluation.AzureAIProject

.. admonition:: Example using Azure AI Project URL:

@@ -40,7 +46,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
:caption: Initialize and call CoherenceEvaluator using Azure AI
Project URL in following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

.. admonition:: Example with Threshold:
@@ -50,23 +57,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END threshold_coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
:caption: Initialize with threshold and call a CoherenceEvaluator
with a query and response.

.. note::

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
To align with support of diverse models, an output key without the
`gpt_` prefix has been added. The old key with the `gpt_` prefix is
still present for compatibility; however, it will be deprecated.
"""

_PROMPTY_FILE = "coherence.prompty"
_RESULT_KEY = "coherence"

id = "azureai://built-in/evaluators/coherence"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
"""Evaluator identifier for cloud evaluation."""

@override
def __init__(self, model_config, *, threshold=3):
def __init__(self, model_config, *, threshold=3, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
self._threshold = threshold
@@ -77,6 +85,7 @@ def __init__(self, model_config, *, threshold=3):
result_key=self._RESULT_KEY,
threshold=threshold,
_higher_is_better=self._higher_is_better,
**kwargs,
)

@overload
@@ -104,9 +113,11 @@ def __call__(
) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
"""Evaluate coherence for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:keyword conversation: The conversation to evaluate. Expected to
contain a list of conversation turns under the key "messages",
and optionally a global context under the key "context". Turns are
dictionaries with keys "content", "role", and possibly
"context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The coherence score.
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
@@ -118,19 +129,22 @@ def __call__( # pylint: disable=docstring-missing-param
*args,
**kwargs,
):
"""Evaluate coherence. Accepts either a query and response for a single evaluation,
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.
"""Evaluate coherence.

Accepts a query/response for a single evaluation, or a conversation
for a multi-turn evaluation. If the conversation has more than one
pair of turns, results are aggregated.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:keyword conversation: The conversation to evaluate. Expected to
contain conversation turns under the key "messages" as
dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str,
List[float]]]]]
"""
return super().__call__(*args, **kwargs)
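
Putting the updated CoherenceEvaluator signature and docstrings above into a short usage sketch — the model configuration values are placeholders, and `is_reasoning_model` is the new preview keyword forwarded through `**kwargs`:

from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = CoherenceEvaluator(model_config=model_config, threshold=3, is_reasoning_model=True)

# Single query/response evaluation.
single_result = evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)

# Multi-turn conversation evaluation; results for each turn are aggregated.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris is the capital of France."},
        {"role": "user", "content": "And of Italy?"},
        {"role": "assistant", "content": "Rome is the capital of Italy."},
    ]
}
conversation_result = evaluator(conversation=conversation)
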
@@ -121,14 +121,18 @@ def __init__(
not_singleton_inputs: List[str] = ["conversation", "kwargs"],
eval_last_turn: bool = False,
conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
conversation_aggregator_override: Optional[
Callable[[List[float]], float]
] = None,
_higher_is_better: Optional[bool] = True,
):
self._not_singleton_inputs = not_singleton_inputs
self._eval_last_turn = eval_last_turn
self._singleton_inputs = self._derive_singleton_inputs()
self._async_evaluator = AsyncEvaluatorBase(self._real_call)
self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
self._conversation_aggregation_function = GetAggregator(
conversation_aggregation_type
)
self._higher_is_better = _higher_is_better
self._threshold = threshold
if conversation_aggregator_override is not None:
@@ -190,7 +194,10 @@ def _derive_singleton_inputs(self) -> List[List[str]]:
overload_inputs = []
for call_signature in call_signatures:
params = call_signature.parameters
if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
if any(
not_singleton_input in params
for not_singleton_input in self._not_singleton_inputs
):
continue
# exclude self since it is not a singleton input
overload_inputs.append([p for p in params if p != "self"])
@@ -234,7 +241,11 @@ def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
best_match = inputs

# Return the best match or the first overload as fallback
return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
return (
best_match
if best_match is not None
else (overload_inputs[0] if overload_inputs else [])
)

def _get_all_singleton_inputs(self) -> List[str]:
"""Get a flattened list of all possible singleton inputs across all overloads.
@@ -345,12 +356,16 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
if len(user_messages) != len(assistant_messages):
raise EvaluationException(
message="Mismatched number of user and assistant messages.",
internal_message=("Mismatched number of user and assistant messages."),
internal_message=(
"Mismatched number of user and assistant messages."
),
)
if len(assistant_messages) > 1:
raise EvaluationException(
message="Conversation can have only one assistant message.",
internal_message=("Conversation can have only one assistant message."),
internal_message=(
"Conversation can have only one assistant message."
),
)
eval_conv_inputs = []
for user_msg, assist_msg in zip(user_messages, assistant_messages):
Expand All @@ -359,12 +374,16 @@ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
conv_messages.append(system_messages[0])
conv_messages.append(user_msg)
conv_messages.append(assist_msg)
eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
eval_conv_inputs.append(
{"conversation": Conversation(messages=conv_messages)}
)
return eval_conv_inputs

return multi_modal_converter

def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
def _convert_kwargs_to_eval_input(
self, **kwargs
) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
"""Convert an arbitrary input into a list of inputs for evaluators.
It is assumed that evaluators generally make use of their inputs in one of two ways.
Either they receive a collection of keyname inputs that are all single values
@@ -414,7 +433,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
matching_inputs = self._get_matching_overload_inputs(**kwargs)
if matching_inputs:
# Check if all required inputs for this overload are provided
required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
required_singletons = {
key: kwargs.get(key, None) for key in matching_inputs
}
required_singletons = remove_optional_singletons(self, required_singletons)
if all(value is not None for value in required_singletons.values()):
return [singletons]
Expand All @@ -438,11 +459,17 @@ def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
if "content" in message:
content = message.get("content", "")
if isinstance(content, list):
if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
if any(
item.get("type") == "image_url"
and "url" in item.get("image_url", {})
for item in content
):
return True
return False
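
For context on what `_is_multi_modal_conversation` detects, a hypothetical conversation fragment that would make the reformatted check above return True; only the `messages`/`content`/`image_url` structure matters, and the other fields are illustrative:

# Hypothetical multi-modal conversation: a message whose "content" is a list
# containing an item of type "image_url" with a "url" key.
multi_modal_conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        },
        {"role": "assistant", "content": "A cat sitting on a windowsill."},
    ]
}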

def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
def _aggregate_results(
self, per_turn_results: List[DoEvalResult[T_EvalValue]]
) -> AggregateResult[T_EvalValue]:
"""Aggregate the evaluation results of each conversation turn into a single result.

Exact implementation might need to vary slightly depending on the results produced.
@@ -472,7 +499,9 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
# Find and average all numeric values
for metric, values in evaluation_per_turn.items():
if all(isinstance(value, (int, float)) for value in values):
aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
aggregated[metric] = self._conversation_aggregation_function(
cast(List[Union[int, float]], values)
)
# Slap the per-turn results back in.
aggregated["evaluation_per_turn"] = evaluation_per_turn
return aggregated
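
A minimal standalone sketch of the mean aggregation performed by the reformatted code above, outside the evaluator class; the metric name and scores are illustrative:

from typing import Dict, List, Union

def aggregate_mean(per_turn_results: List[Dict[str, Union[int, float, str]]]) -> Dict:
    # Collect per-turn values for each metric, then average the numeric ones,
    # mirroring _aggregate_results with the default MEAN aggregator.
    evaluation_per_turn: Dict[str, list] = {}
    for turn_result in per_turn_results:
        for metric, value in turn_result.items():
            evaluation_per_turn.setdefault(metric, []).append(value)

    aggregated: Dict[str, Union[float, Dict[str, list]]] = {}
    for metric, values in evaluation_per_turn.items():
        if all(isinstance(value, (int, float)) for value in values):
            aggregated[metric] = sum(values) / len(values)
    # Put the per-turn results back in, as the original method does.
    aggregated["evaluation_per_turn"] = evaluation_per_turn
    return aggregated

# Two turns scored 4 and 2 -> {"coherence": 3.0, "evaluation_per_turn": {"coherence": [4, 2]}}
print(aggregate_mean([{"coherence": 4}, {"coherence": 2}]))
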
@@ -489,17 +518,28 @@ def _parse_tools_from_response(self, response):
if isinstance(response, list):
for message in response:
# Extract tool calls from assistant messages
if message.get("role") == "assistant" and isinstance(message.get("content"), list):
if message.get("role") == "assistant" and isinstance(
message.get("content"), list
):
for content_item in message.get("content"):
if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
if (
isinstance(content_item, dict)
and content_item.get("type") == "tool_call"
):
tool_calls.append(content_item)

# Extract tool results from tool messages
elif message.get("role") == "tool" and message.get("tool_call_id"):
tool_call_id = message.get("tool_call_id")
if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
if (
isinstance(message.get("content"), list)
and len(message.get("content")) > 0
):
result_content = message.get("content")[0]
if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
if (
isinstance(result_content, dict)
and result_content.get("type") == "tool_result"
):
tool_results_map[tool_call_id] = result_content

# Attach results to their corresponding calls
@@ -510,7 +550,9 @@ def _parse_tools_from_response(self, response):

return tool_calls
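
A hypothetical agent response showing the message shapes `_parse_tools_from_response` inspects: an assistant message whose content list carries a `tool_call` item, and a `tool` message whose first content item is a `tool_result` matched by `tool_call_id`. Field names beyond `role`, `content`, `type`, and `tool_call_id` are invented for illustration:

agent_response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_001",            # assumed identifier field
                "name": "file_search",                 # illustrative tool name
                "arguments": {"query": "Q2 revenue"},  # illustrative arguments
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_001",
        "content": [
            {"type": "tool_result", "tool_result": "Revenue grew 12% quarter over quarter."}
        ],
    },
]
# The parser collects the tool_call item, maps the tool_result by tool_call_id,
# and attaches each result to its corresponding call before returning the list.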

async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
async def _real_call(
self, **kwargs
) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
"""The asynchronous call where real end-to-end evaluation logic is performed.

:keyword kwargs: The inputs to evaluate.
@@ -563,7 +605,9 @@ def _to_async(self) -> "AsyncEvaluatorBase":

@experimental
@final
def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
def _set_conversation_aggregation_type(
self, conversation_aggregation_type: _AggregationType
) -> None:
"""Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
multi-turn conversation into a single top-level result.
@@ -572,11 +616,15 @@ def _set_conversation_aggregation_type(self, conversation_aggregation_type: _Agg
results of a conversation to produce a single result.
:type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
"""
self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
self._conversation_aggregation_function = GetAggregator(
conversation_aggregation_type
)

@experimental
@final
def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
def _set_conversation_aggregator(
self, aggregator: Callable[[List[float]], float]
) -> None:
"""Set the conversation aggregator function directly. This function will be applied to all numeric outputs
of an evaluator when it evaluates a multi-turn conversation and thus ends up with multiple results per
evaluation that need to coalesce into a single result. Use when built-in aggregators do not
@@ -606,7 +654,9 @@ class AsyncEvaluatorBase:
to ensure that no one ever needs to extend or otherwise modify this class directly.
"""

def __init__(self, real_call): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
def __init__(
self, real_call
): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
self._real_call = real_call

# Don't look at my shame. Nothing to see here....