Reasoning support for evaluators #42482

Open
wants to merge 79 commits into base: main
Commits (79)
4318329
Prepare evals SDK Release
May 28, 2025
192b980
Fix bug
May 28, 2025
758adb4
Fix for ADV_CONV for FDP projects
May 29, 2025
de09fd1
Update release date
May 29, 2025
ef60fe6
Merge branch 'main' into main
nagkumar91 May 29, 2025
8ca51d0
Merge branch 'Azure:main' into main
nagkumar91 May 30, 2025
98bfc3a
Merge branch 'Azure:main' into main
nagkumar91 Jun 2, 2025
a5f32e8
Merge branch 'Azure:main' into main
nagkumar91 Jun 9, 2025
5fd88b6
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
51f2b44
Merge branch 'Azure:main' into main
nagkumar91 Jun 10, 2025
a5be8b5
Merge branch 'Azure:main' into main
nagkumar91 Jun 16, 2025
75965b7
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
d0c5e53
Merge branch 'Azure:main' into main
nagkumar91 Jun 25, 2025
b790276
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
d5ca243
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
8d62e36
re-add pyrit to matrix
Jun 26, 2025
59a70f2
Change grader ids
Jun 26, 2025
4d146d7
Merge branch 'Azure:main' into main
nagkumar91 Jun 26, 2025
f7a4c83
Update unit test
Jun 27, 2025
79e3a40
replace all old grader IDs in tests
Jun 27, 2025
588cbec
Merge branch 'main' into main
nagkumar91 Jun 30, 2025
7514472
Update platform-matrix.json
nagkumar91 Jun 30, 2025
28b2513
Update test to ensure everything is mocked
Jul 1, 2025
8603e0e
tox/black fixes
Jul 1, 2025
895f226
Skip that test with issues
Jul 1, 2025
b4b2daf
Merge branch 'Azure:main' into main
nagkumar91 Jul 1, 2025
023f07f
update grader ID according to API View feedback
Jul 1, 2025
45b5f5d
Update test
Jul 2, 2025
1ccb4db
remove string check for grader ID
Jul 2, 2025
6fd9aa5
Merge branch 'Azure:main' into main
nagkumar91 Jul 2, 2025
f871855
Update changelog and officialy start freeze
Jul 2, 2025
59ac230
update the enum according to suggestions
Jul 2, 2025
794a2c4
update the changelog
Jul 2, 2025
b33363c
Finalize logic
Jul 2, 2025
464e2dd
Merge branch 'Azure:main' into main
nagkumar91 Jul 3, 2025
4585b14
Merge branch 'Azure:main' into main
nagkumar91 Jul 7, 2025
89c2988
Initial plan
Copilot Jul 7, 2025
6805018
Fix client request ID headers in azure-ai-evaluation
Copilot Jul 7, 2025
aad48df
Fix client request ID header format in rai_service.py
Copilot Jul 7, 2025
db75552
Merge pull request #5 from nagkumar91/copilot/fix-4
nagkumar91 Jul 10, 2025
b8eebf3
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
2899ad4
Merge branch 'Azure:main' into main
nagkumar91 Jul 10, 2025
c431563
Merge branch 'Azure:main' into main
nagkumar91 Jul 17, 2025
79ed63c
Merge branch 'Azure:main' into main
nagkumar91 Jul 18, 2025
a3be3fc
Merge branch 'Azure:main' into main
nagkumar91 Jul 21, 2025
056ac4d
Passing threshold in AzureOpenAIScoreModelGrader
Jul 21, 2025
1779059
Add changelog
Jul 21, 2025
43fecff
Adding the self.pass_threshold instead of pass_threshold
Jul 21, 2025
b0c102b
Merge branch 'Azure:main' into main
nagkumar91 Jul 22, 2025
7bf5f1f
Add the python grader
Jul 22, 2025
3248ad0
Remove redundant test
Jul 22, 2025
d76f59b
Add class to exception list and format code
Jul 23, 2025
4d60e43
Merge branch 'main' into feature/python_grader
nagkumar91 Jul 24, 2025
98d1626
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
9248c38
Add properties to evaluation upload run for FDP
Jul 24, 2025
74b760f
Remove debug
Jul 24, 2025
23dbc85
Merge branch 'feature/python_grader'
Jul 24, 2025
467ccb6
Remove the redundant property
Jul 24, 2025
c2beee8
Merge branch 'Azure:main' into main
nagkumar91 Jul 24, 2025
be9a19a
Fix changelog
Jul 24, 2025
de3a1e1
Fix the multiple features added section
Jul 24, 2025
f9faa61
removed the properties in update
Jul 24, 2025
69e783a
Merge branch 'Azure:main' into main
nagkumar91 Jul 28, 2025
8ebea2a
Merge branch 'Azure:main' into main
nagkumar91 Jul 31, 2025
3f9c818
Merge branch 'Azure:main' into main
nagkumar91 Aug 1, 2025
3b3159c
Merge branch 'Azure:main' into main
nagkumar91 Aug 5, 2025
d78b834
Merge branch 'Azure:main' into main
nagkumar91 Aug 6, 2025
ae3fc52
Merge branch 'Azure:main' into main
nagkumar91 Aug 8, 2025
19cce75
evaluation: support is_reasoning_model across all prompty-based evalu…
Aug 8, 2025
e59ca7f
evaluation: docs(Preview) + groundedness feature-detection + is_reaso…
Aug 8, 2025
98b4618
evaluation: revert _proxy_completion_model.py to origin/main version
Aug 8, 2025
706c042
Merge branch 'Azure:main' into main
nagkumar91 Aug 11, 2025
c418513
Merge remote-tracking branch 'origin/main' into diff-20250811-171736
Aug 12, 2025
86f24ba
Restore files that shouldn't have been modified
Aug 12, 2025
a1e55b4
Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evalua…
nagkumar91 Aug 12, 2025
bd6809f
Update the groundedness based on comments
Aug 12, 2025
3ae37cb
Add changelog to bug fix and link issue
Aug 12, 2025
6b8d4ce
Fix docstring
Aug 12, 2025
733ee1a
lint fixes
Aug 12, 2025
8 changes: 8 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -5,11 +5,19 @@
### Breaking Changes

### Features Added

- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.

### Bugs Fixed

- [Bug](https://github.com/Azure/azure-sdk-for-python/issues/39909): Added `is_reasoning_model` keyword parameter to all evaluators
(`SimilarityEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`,
`RetrievalEvaluator`, `GroundednessEvaluator`, `IntentResolutionEvaluator`,
`ResponseCompletenessEvaluator`, `TaskAdherenceEvaluator`, `ToolCallAccuracyEvaluator`).
When set, evaluator configuration is adjusted appropriately for reasoning models.
`QAEvaluator` now propagates this parameter to its child evaluators.

### Other Changes

## 1.10.0 (2025-07-31)
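
The `tags` parameter described in the Features Added entry above is passed straight to `evaluate`. A minimal usage sketch follows; the data file, model configuration values, and tag names are placeholders, and the exact `evaluate` signature should be confirmed against the released azure-ai-evaluation package.

from azure.ai.evaluation import evaluate, CoherenceEvaluator, AzureOpenAIModelConfiguration

# Illustrative model configuration; replace endpoint, key, and deployment with real values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment>",
)

# `tags` attaches key-value metadata to the evaluation run, e.g. for A/B testing
# or for filtering runs later.
result = evaluate(
    data="eval_data.jsonl",  # hypothetical JSONL file with query/response columns
    evaluators={"coherence": CoherenceEvaluator(model_config)},
    tags={"experiment": "reasoning-rollout", "variant": "baseline"},
)
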
@@ -12,17 +12,22 @@

class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""
Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
Evaluates coherence for a given query and response or a multi-turn
conversation, including reasoning.

The coherence measure assesses the ability of the language model to generate text that reads naturally,
flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
and user-friendliness of a model's generated responses in real-world applications.
The coherence measure assesses the model's ability to generate text that
reads naturally, flows smoothly, and resembles human-like language. Use it
when assessing the readability and user-friendliness of responses.

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
:type model_config:
Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]
:param threshold: The threshold for the coherence evaluator. Default is 3.
:type threshold: int
:keyword is_reasoning_model: (Preview) If True, the chat completions
configuration is adjusted for reasoning models.
:type is_reasoning_model: bool

.. admonition:: Example:

@@ -31,7 +36,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
:caption: Initialize and call CoherenceEvaluator using
azure.ai.evaluation.AzureAIProject

.. admonition:: Example using Azure AI Project URL:

@@ -40,7 +46,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
:caption: Initialize and call CoherenceEvaluator using Azure AI
Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

.. admonition:: Example with Threshold:
@@ -50,23 +57,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END threshold_coherence_evaluator]
:language: python
:dedent: 8
:caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
:caption: Initialize with threshold and call a CoherenceEvaluator
with a query and response.

.. note::

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
To align with support of diverse models, an output key without the
`gpt_` prefix has been added. The old key with the `gpt_` prefix is
still present for compatibility; however, it will be deprecated.
"""

_PROMPTY_FILE = "coherence.prompty"
_RESULT_KEY = "coherence"

id = "azureai://built-in/evaluators/coherence"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
"""Evaluator identifier, experimental to be used only with cloud evaluation"""

Copilot AI Aug 12, 2025

The docstring is missing a comma. It should read 'Evaluator identifier, experimental, to be used only with cloud evaluation' or 'Evaluator identifier (experimental) to be used only with cloud evaluation'.

Suggested change
"""Evaluator identifier, experimental to be used only with cloud evaluation"""
"""Evaluator identifier, experimental, to be used only with cloud evaluation"""



@override
def __init__(self, model_config, *, threshold=3):
def __init__(self, model_config, *, threshold=3, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
self._threshold = threshold
@@ -77,6 +85,7 @@ def __init__(self, model_config, *, threshold=3):
result_key=self._RESULT_KEY,
threshold=threshold,
_higher_is_better=self._higher_is_better,
**kwargs,
)

@overload
@@ -104,9 +113,11 @@ def __call__(
) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
"""Evaluate coherence for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:keyword conversation: The conversation to evaluate. Expected to
contain a list of conversation turns under the key "messages",
and optionally a global context under the key "context". Turns are
dictionaries with keys "content", "role", and possibly
"context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The coherence score.
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
@@ -118,19 +129,22 @@ def __call__( # pylint: disable=docstring-missing-param
*args,
**kwargs,
):
"""Evaluate coherence. Accepts either a query and response for a single evaluation,
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.
"""Evaluate coherence.

Accepts a query/response for a single evaluation, or a conversation
for a multi-turn evaluation. If the conversation has more than one
pair of turns, results are aggregated.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:keyword conversation: The conversation to evaluate. Expected to
contain conversation turns under the key "messages" as
dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The relevance score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str,
List[float]]]]]
"""
return super().__call__(*args, **kwargs)
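
Because `__init__` now accepts `**kwargs`, the new `is_reasoning_model` flag can be passed directly when constructing the evaluator. A short direct-call sketch, using placeholder configuration values; the output keys noted in the comment are illustrative rather than exhaustive.

from azure.ai.evaluation import CoherenceEvaluator, AzureOpenAIModelConfiguration

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<reasoning-model-deployment>",
)

# is_reasoning_model=True travels through **kwargs into PromptyEvaluatorBase and on to
# AsyncPrompty.load, so the prompty / chat-completions settings suit reasoning models.
coherence = CoherenceEvaluator(model_config, threshold=3, is_reasoning_model=True)
result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
# Expected keys include "coherence" and, for backwards compatibility, "gpt_coherence".
print(result)
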
@@ -4,7 +4,9 @@
from concurrent.futures import as_completed
from typing import TypeVar, Dict, List

from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from azure.ai.evaluation._legacy._adapters.tracing import (
ThreadPoolExecutorWithContext as ThreadPoolExecutor,
)
from typing_extensions import override

from azure.ai.evaluation._evaluators._common import EvaluatorBase
@@ -15,8 +15,17 @@

from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
from azure.ai.evaluation._exceptions import (
EvaluationException,
ErrorBlame,
ErrorCategory,
ErrorTarget,
)
from ..._common.utils import (
construct_prompty_model_config,
validate_model_config,
parse_quality_evaluator_reason_score,
)
from . import EvaluatorBase

try:
@@ -71,7 +80,11 @@ def __init__(
self._prompty_file = prompty_file
self._threshold = threshold
self._higher_is_better = _higher_is_better
super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
super().__init__(
eval_last_turn=eval_last_turn,
threshold=threshold,
_higher_is_better=_higher_is_better,
)

subclass_name = self.__class__.__name__
user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
@@ -82,7 +95,9 @@ def __init__(
)

self._flow = AsyncPrompty.load(
source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
source=self._prompty_file,
model=prompty_model_config,
is_reasoning_model=self._is_reasoning_model,
)

# __call__ not overridden here because child classes have such varied signatures that there's no point
@@ -13,18 +13,24 @@

class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""
Evaluates the fluency of a given response or a multi-turn conversation, including reasoning.
Evaluates the fluency of a given response or a multi-turn conversation,
including reasoning.

The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic
structures, and appropriate vocabulary usage, resulting in linguistically correct responses.
The fluency measure assesses the extent to which generated text conforms
to grammar, syntax, and appropriate vocabulary, resulting in linguistically
correct responses.

Fluency scores range from 1 to 5, with 1 being the least fluent and 5 being the most fluent.
Fluency scores range from 1 to 5 (1 = least fluent, 5 = most fluent).

:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
:type model_config:
Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]
:param threshold: The threshold for the fluency evaluator. Default is 3.
:type threshold: int
:keyword is_reasoning_model: (Preview) If True, the chat completions
configuration is adjusted for reasoning models.
:type is_reasoning_model: bool

.. admonition:: Example:

@@ -51,24 +57,25 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
:end-before: [END fluency_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
:caption: Initialize and call FluencyEvaluator using Azure AI
Project URL in the following format
https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

.. note::

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
To align with support of diverse models, an output key without the
`gpt_` prefix has been added. The old key with the `gpt_` prefix is
still present for compatibility and will be deprecated.
"""

_PROMPTY_FILE = "fluency.prompty"
_RESULT_KEY = "fluency"

id = "azureai://built-in/evaluators/fluency"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
"""Evaluator identifier for cloud evaluation."""

@override
def __init__(self, model_config, *, threshold=3):
def __init__(self, model_config, *, threshold=3, **kwargs):
current_dir = os.path.dirname(__file__)
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
self._threshold = threshold
@@ -79,6 +86,7 @@ def __init__(self, model_config, *, threshold=3, **kwargs):
result_key=self._RESULT_KEY,
threshold=threshold,
_higher_is_better=self._higher_is_better,
**kwargs,
)

@overload
@@ -103,9 +111,10 @@ def __call__(
) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
"""Evaluate fluency for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:keyword conversation: The conversation to evaluate. Expected to
contain turns under the key "messages", and optionally a global
context under the key "context". Turns are dictionaries with
keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
@@ -118,16 +127,19 @@ def __call__( # pylint: disable=docstring-missing-param
**kwargs,
):
"""
Evaluate fluency. Accepts either a response for a single evaluation,
or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
the evaluator will aggregate the results of each turn.

:keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
Evaluate fluency. Accepts either a response for a single evaluation,
or a conversation for a multi-turn evaluation. If the conversation has
more than one turn, the evaluator will aggregate per-turn results.

:keyword response: The response to be evaluated. Mutually exclusive
with the "conversation" parameter.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to
contain turns under the key "messages" as dictionaries with
keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str,
List[float]]]]]
"""
return super().__call__(*args, **kwargs)
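
The conversation overloads above expect a specific payload shape. A small sketch of that shape with made-up turn content; `model_config` is assumed to be defined as in the earlier sketches, and per-turn scores are aggregated when the conversation has multiple assistant turns.

from azure.ai.evaluation import FluencyEvaluator

# Turns live under "messages"; each turn is a dict with "content" and "role".
conversation = {
    "messages": [
        {"role": "user", "content": "Summarize the quarterly report."},
        {"role": "assistant", "content": "Revenue grew 12% year over year, driven by cloud."},
        {"role": "user", "content": "What is the main risk?"},
        {"role": "assistant", "content": "Supply-chain delays remain the largest risk."},
    ]
}
fluency = FluencyEvaluator(model_config)  # model_config as defined above
result = fluency(conversation=conversation)  # aggregate score plus per-turn details
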
@@ -1,7 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os, logging
import os
import logging
from inspect import signature
from typing import Dict, List, Optional, Union

from typing_extensions import overload, override
@@ -49,6 +51,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
~azure.ai.evaluation.OpenAIModelConfiguration]
:param threshold: The threshold for the groundedness evaluator. Default is 3.
:type threshold: int
:keyword is_reasoning_model: (Preview) If True, the chat completions
configuration is adjusted for reasoning models.
:type is_reasoning_model: bool

.. admonition:: Example:

@@ -105,10 +110,16 @@ def __init__(self, model_config, *, threshold=3, **kwargs):
result_key=self._RESULT_KEY,
threshold=threshold,
_higher_is_better=self._higher_is_better,
**kwargs,
)
self._model_config = model_config
self.threshold = threshold
# Needs to be set because it's used in call method to re-validate prompt if `query` is provided

# Cache whether AsyncPrompty.load supports the is_reasoning_model parameter.
try:
self._has_is_reasoning_model_param: bool = "is_reasoning_model" in signature(AsyncPrompty.load).parameters
except Exception: # Very defensive: if inspect fails, assume not supported
self._has_is_reasoning_model_param = False

@overload
def __call__(
@@ -202,7 +213,18 @@ def __call__( # pylint: disable=docstring-missing-param
self._DEFAULT_OPEN_API_VERSION,
UserAgentSingleton().value,
)
self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

if self._has_is_reasoning_model_param:
self._flow = AsyncPrompty.load(
source=self._prompty_file,
model=prompty_model_config,
is_reasoning_model=self._is_reasoning_model,
)
else:
self._flow = AsyncPrompty.load(
source=self._prompty_file,
model=prompty_model_config,
)

return super().__call__(*args, **kwargs)
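
The guarded `AsyncPrompty.load` call above is a feature-detection pattern: forward `is_reasoning_model` only when the installed loader actually declares that parameter. The same idea in isolation, as a minimal sketch; `load_flow` and its arguments are stand-ins, not part of the SDK's API.

import inspect

def load_flow(loader, source, model, is_reasoning_model=False):
    # Forward `is_reasoning_model` only if `loader` declares that parameter.
    try:
        supported = "is_reasoning_model" in inspect.signature(loader).parameters
    except (TypeError, ValueError):  # signature() can fail on some callables
        supported = False
    if supported:
        return loader(source=source, model=model, is_reasoning_model=is_reasoning_model)
    return loader(source=source, model=model)
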

@@ -282,4 +304,4 @@ def _get_context_from_agent_response(self, response, tool_definitions):
logger.debug(f"Error extracting context from agent response : {str(ex)}")
context = ""

return context if context else None
return context

Copilot AI Aug 12, 2025

The function _get_context_from_agent_response should return None when context is empty, not an empty string. The original code returned context if context else None, which properly handles the case where no context is found. Returning an empty string may cause issues in downstream processing that expects None for missing context.

Suggested change
return context
return context if context else None

