diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 75670e2c3849..4a999adb7a18 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,11 +5,19 @@ ### Breaking Changes ### Features Added + - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter. - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used. ### Bugs Fixed +- [Bug](https://github.com/Azure/azure-sdk-for-python/issues/39909): Added `is_reasoning_model` keyword parameter to all evaluators + (`SimilarityEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`, + `RetrievalEvaluator`, `GroundednessEvaluator`, `IntentResolutionEvaluator`, + `ResponseCompletenessEvaluator`, `TaskAdherenceEvaluator`, `ToolCallAccuracyEvaluator`). + When set to `True`, the evaluator's chat completions configuration is adjusted for reasoning models. + `QAEvaluator` now propagates this parameter to its child evaluators. + ### Other Changes ## 1.10.0 (2025-07-31) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index 66cc593452fb..6644bceaa263 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -12,17 +12,22 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ - Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning. + Evaluates coherence for a given query and response or a multi-turn + conversation, including reasoning. - The coherence measure assesses the ability of the language model to generate text that reads naturally, - flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability - and user-friendliness of a model's generated responses in real-world applications. + The coherence measure assesses the model's ability to generate text that + reads naturally, flows smoothly, and resembles human-like language. Use it + when assessing the readability and user-friendliness of responses. :param model_config: Configuration for the Azure OpenAI model. - :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, + :type model_config: + Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the coherence evaluator. Default is 3. :type threshold: int + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool ..
admonition:: Example: @@ -31,7 +36,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END coherence_evaluator] :language: python :dedent: 8 - :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject + :caption: Initialize and call CoherenceEvaluator using + azure.ai.evaluation.AzureAIProject .. admonition:: Example using Azure AI Project URL: @@ -40,7 +46,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END coherence_evaluator] :language: python :dedent: 8 - :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format + :caption: Initialize and call CoherenceEvaluator using Azure AI + Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} .. admonition:: Example with Threshold: @@ -50,23 +57,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END threshold_coherence_evaluator] :language: python :dedent: 8 - :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response. + :caption: Initialize with threshold and call a CoherenceEvaluator + with a query and response. .. note:: - To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added. - To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output; - however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. + To align with support of diverse models, an output key without the + `gpt_` prefix has been added. The old key with the `gpt_` prefix is + still present for compatibility; however, it will be deprecated. """ _PROMPTY_FILE = "coherence.prompty" _RESULT_KEY = "coherence" id = "azureai://built-in/evaluators/coherence" - """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + """Evaluator identifier, experimental and to be used only with cloud evaluation.""" @override - def __init__(self, model_config, *, threshold=3): + def __init__(self, model_config, *, threshold=3, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self._threshold = threshold @@ -77,6 +85,7 @@ def __init__(self, model_config, *, threshold=3): result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) @overload @@ -104,9 +113,11 @@ def __call__( ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate coherence for a conversation - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages", and potentially a global context under the key "context". Conversation turns are expected - to be dictionaries with keys "content", "role", and possibly "context". + :keyword conversation: The conversation to evaluate. Expected to + contain a list of conversation turns under the key "messages", + and optionally a global context under the key "context". Turns are + dictionaries with keys "content", "role", and possibly + "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The coherence score. :rtype: Dict[str, Union[float, Dict[str, List[float]]]] @@ -118,19 +129,22 @@ def __call__( # pylint: disable=docstring-missing-param *args, **kwargs, ): - """Evaluate coherence. 
Accepts either a query and response for a single evaluation, - or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of - turns, the evaluator will aggregate the results of each turn. + """Evaluate coherence. + + Accepts a query/response for a single evaluation, or a conversation + for a multi-turn evaluation. If the conversation has more than one + pair of turns, results are aggregated. :keyword query: The query to be evaluated. :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: Optional[str] - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected - to be dictionaries with keys "content" and "role". + :keyword conversation: The conversation to evaluate. Expected to + contain conversation turns under the key "messages" as + dictionaries with keys "content" and "role". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The relevance score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, + List[float]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py index 7eafa42a2926..544336fb4adf 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py @@ -4,7 +4,9 @@ from concurrent.futures import as_completed from typing import TypeVar, Dict, List -from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor +from azure.ai.evaluation._legacy._adapters.tracing import ( + ThreadPoolExecutorWithContext as ThreadPoolExecutor, +) from typing_extensions import override from azure.ai.evaluation._evaluators._common import EvaluatorBase diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py index 1e0dfe9d5ce1..6cf26e352e2d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py @@ -15,8 +15,17 @@ from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget -from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score +from azure.ai.evaluation._exceptions import ( + EvaluationException, + ErrorBlame, + ErrorCategory, + ErrorTarget, +) +from ..._common.utils import ( + construct_prompty_model_config, + validate_model_config, + parse_quality_evaluator_reason_score, +) from . 
import EvaluatorBase try: @@ -71,7 +80,11 @@ def __init__( self._prompty_file = prompty_file self._threshold = threshold self._higher_is_better = _higher_is_better - super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better) + super().__init__( + eval_last_turn=eval_last_turn, + threshold=threshold, + _higher_is_better=_higher_is_better, + ) subclass_name = self.__class__.__name__ user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})" @@ -82,7 +95,9 @@ def __init__( ) self._flow = AsyncPrompty.load( - source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model + source=self._prompty_file, + model=prompty_model_config, + is_reasoning_model=self._is_reasoning_model, ) # __call__ not overridden here because child classes have such varied signatures that there's no point diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 989f9e06b4af..47e63787e218 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -13,18 +13,24 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): """ - Evaluates the fluency of a given response or a multi-turn conversation, including reasoning. + Evaluates the fluency of a given response or a multi-turn conversation, + including reasoning. - The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic - structures, and appropriate vocabulary usage, resulting in linguistically correct responses. + The fluency measure assesses the extent to which generated text conforms + to grammar, syntax, and appropriate vocabulary, resulting in linguistically + correct responses. - Fluency scores range from 1 to 5, with 1 being the least fluent and 5 being the most fluent. + Fluency scores range from 1 to 5 (1 = least fluent, 5 = most fluent). :param model_config: Configuration for the Azure OpenAI model. - :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, + :type model_config: + Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the fluency evaluator. Default is 3. :type threshold: int + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool .. admonition:: Example: @@ -51,24 +57,25 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): :end-before: [END fluency_evaluator] :language: python :dedent: 8 - :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format + :caption: Initialize and call FluencyEvaluator using Azure AI + Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} .. note:: - To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added. - To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output; - however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. + To align with support of diverse models, an output key without the + `gpt_` prefix has been added. 
The old key with the `gpt_` prefix is + still present for compatibility and will be deprecated. """ _PROMPTY_FILE = "fluency.prompty" _RESULT_KEY = "fluency" id = "azureai://built-in/evaluators/fluency" - """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + """Evaluator identifier for cloud evaluation.""" @override - def __init__(self, model_config, *, threshold=3): + def __init__(self, model_config, *, threshold=3, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self._threshold = threshold @@ -79,6 +86,7 @@ def __init__(self, model_config, *, threshold=3): result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) @overload @@ -103,9 +111,10 @@ def __call__( ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluate fluency for a conversation - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages", and potentially a global context under the key "context". Conversation turns are expected - to be dictionaries with keys "content", "role", and possibly "context". + :keyword conversation: The conversation to evaluate. Expected to + contain turns under the key "messages", and optionally a global + context under the key "context". Turns are dictionaries with + keys "content", "role", and possibly "context". :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The fluency score :rtype: Dict[str, Union[float, Dict[str, List[float]]]] @@ -118,16 +127,19 @@ def __call__( # pylint: disable=docstring-missing-param **kwargs, ): """ - Evaluate fluency. Accepts either a response for a single evaluation, - or a conversation for a multi-turn evaluation. If the conversation has more than one turn, - the evaluator will aggregate the results of each turn. - - :keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter. - :paramtype response: Optional[str] - :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the - key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". - :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The fluency score. - :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]] + Evaluate fluency. Accepts either a response for a single evaluation, + or a conversation for a multi-turn evaluation. If the conversation has + more than one turn, the evaluator will aggregate per-turn results. + + :keyword response: The response to be evaluated. Mutually exclusive + with the "conversation" parameter. + :paramtype response: Optional[str] + :keyword conversation: The conversation to evaluate. Expected to + contain turns under the key "messages" as dictionaries with + keys "content" and "role". + :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] + :return: The fluency score. 
+ :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, + List[float]]]]] """ return super().__call__(*args, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py index 9aa8520630fc..09c3d3c7263b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py @@ -1,7 +1,9 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -import os, logging +import os +import logging +from inspect import signature from typing import Dict, List, Optional, Union from typing_extensions import overload, override @@ -49,6 +51,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the groundedness evaluator. Default is 3. :type threshold: int + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool .. admonition:: Example: @@ -105,10 +110,16 @@ def __init__(self, model_config, *, threshold=3, **kwargs): result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) self._model_config = model_config self.threshold = threshold - # Needs to be set because it's used in call method to re-validate prompt if `query` is provided + + # Cache whether AsyncPrompty.load supports the is_reasoning_model parameter. + try: + self._has_is_reasoning_model_param: bool = "is_reasoning_model" in signature(AsyncPrompty.load).parameters + except Exception: # Very defensive: if inspect fails, assume not supported + self._has_is_reasoning_model_param = False @overload def __call__( @@ -202,7 +213,18 @@ def __call__( # pylint: disable=docstring-missing-param self._DEFAULT_OPEN_API_VERSION, UserAgentSingleton().value, ) - self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config) + + if self._has_is_reasoning_model_param: + self._flow = AsyncPrompty.load( + source=self._prompty_file, + model=prompty_model_config, + is_reasoning_model=self._is_reasoning_model, + ) + else: + self._flow = AsyncPrompty.load( + source=self._prompty_file, + model=prompty_model_config, + ) return super().__call__(*args, **kwargs) @@ -282,4 +304,4 @@ def _get_context_from_agent_response(self, response, tool_definitions): logger.debug(f"Error extracting context from agent response : {str(ex)}") context = "" - return context if context else None + return context diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py index df095f67ba97..d4c943c59282 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py @@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]): :type similarity_threshold: int :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5. 
:type f1_score_threshold: float + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool :return: A callable class that evaluates and generates metrics for "question-answering" scenario. :param kwargs: Additional arguments to pass to the evaluator. :type kwargs: Any @@ -102,11 +105,31 @@ def __init__( raise TypeError(f"{name} must be an int or float, got {type(value)}") evaluators = [ - GroundednessEvaluator(model_config, threshold=groundedness_threshold), - RelevanceEvaluator(model_config, threshold=relevance_threshold), - CoherenceEvaluator(model_config, threshold=coherence_threshold), - FluencyEvaluator(model_config, threshold=fluency_threshold), - SimilarityEvaluator(model_config, threshold=similarity_threshold), + GroundednessEvaluator( + model_config, + threshold=groundedness_threshold, + **kwargs, + ), + RelevanceEvaluator( + model_config, + threshold=relevance_threshold, + **kwargs, + ), + CoherenceEvaluator( + model_config, + threshold=coherence_threshold, + **kwargs, + ), + FluencyEvaluator( + model_config, + threshold=fluency_threshold, + **kwargs, + ), + SimilarityEvaluator( + model_config, + threshold=similarity_threshold, + **kwargs, + ), F1ScoreEvaluator(threshold=f1_score_threshold), ] super().__init__(evaluators=evaluators, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py index bac157ab2623..e9a548303290 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py @@ -8,7 +8,12 @@ from typing_extensions import overload, override -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._exceptions import ( + EvaluationException, + ErrorBlame, + ErrorCategory, + ErrorTarget, +) from ..._common.utils import reformat_conversation_history, reformat_agent_response from azure.ai.evaluation._model_configurations import Conversation @@ -35,6 +40,9 @@ class RelevanceEvaluator(PromptyEvaluatorBase): ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the relevance evaluator. Default is 3. :type threshold: int + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool ..
admonition:: Example: @@ -79,7 +87,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase): """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold=3): + def __init__(self, model_config, *, threshold=3, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self._threshold = threshold @@ -90,6 +98,7 @@ def __init__(self, model_config, *, threshold=3): result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py index 1f0a886f944f..138278a0d7d8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py @@ -8,7 +8,12 @@ from typing_extensions import overload, override -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from azure.ai.evaluation._exceptions import ( + EvaluationException, + ErrorBlame, + ErrorCategory, + ErrorTarget, +) from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score from azure.ai.evaluation._model_configurations import Conversation, Message @@ -37,6 +42,9 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]): :param model_config: Configuration for the Azure OpenAI model. :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration] + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool ..
admonition:: Example: @@ -73,11 +81,22 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]): """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs): + def __init__( + self, + model_config, + *, + threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, + **kwargs, + ): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self.threshold = threshold - super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs) + super().__init__( + model_config=model_config, + prompty_file=prompty_path, + result_key=self._RESULT_KEY, + **kwargs, + ) @overload def __call__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index eea0cd516154..951faba0a305 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -7,7 +7,9 @@ from typing import Dict, List, Union from typing_extensions import overload, override -from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase +from azure.ai.evaluation._evaluators._common._base_prompty_eval import ( + PromptyEvaluatorBase, +) from azure.ai.evaluation._model_configurations import Conversation logger = logging.getLogger(__name__) @@ -33,6 +35,9 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the evaluation. Default is 3. :type threshold: float + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool :return: A function that evaluates and generates metrics for "chat" scenario. :rtype: Callable @@ -78,7 +83,8 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=super-init-not-called + # pylint: disable=super-init-not-called + def __init__(self, model_config, *, threshold: float = 3, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self._threshold = threshold @@ -89,6 +95,7 @@ def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=su result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py index dd4043944a88..1ac08ef527b5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py @@ -14,22 +14,28 @@ class SimilarityEvaluator(PromptyEvaluatorBase): """ Evaluates similarity score for a given query, response, and ground truth. 
- The similarity measure evaluates the likeness between a ground truth sentence (or document) and the - AI model's generated prediction. This calculation involves creating sentence-level embeddings for both - the ground truth and the model's prediction, which are high-dimensional vector representations capturing - the semantic meaning and context of the sentences. + The similarity measure evaluates the likeness between a ground truth + sentence (or document) and the AI model's generated prediction. This + involves creating sentence-level embeddings for both the ground truth and + the model's prediction. These are high-dimensional vectors capturing the + semantic meaning and context of the sentences. - Use it when you want an objective evaluation of an AI model's performance, particularly in text generation - tasks where you have access to ground truth responses. Similarity enables you to assess the generated - text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy. + Use it when you need an objective evaluation of an AI model's performance, + especially for text generation with ground truth responses. Similarity + assesses semantic alignment with the desired content and helps gauge model + quality and accuracy. - Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar. + Similarity scores range from 1 to 5 (1 = least similar, 5 = most similar). :param model_config: Configuration for the Azure OpenAI model. - :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, + :type model_config: + Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration] :param threshold: The threshold for the similarity evaluator. Default is 3. :type threshold: int + :keyword is_reasoning_model: (Preview) If True, the chat completions + configuration is adjusted for reasoning models + :type is_reasoning_model: bool .. admonition:: Example: @@ -38,7 +44,8 @@ class SimilarityEvaluator(PromptyEvaluatorBase): :end-before: [END similarity_evaluator] :language: python :dedent: 8 - :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type. + :caption: Initialize and call a SimilarityEvaluator with a query, + response, and ground truth. .. admonition:: Example using Azure AI Project URL: @@ -47,7 +54,8 @@ class SimilarityEvaluator(PromptyEvaluatorBase): :end-before: [END similarity_evaluator] :language: python :dedent: 8 - :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format + :caption: Initialize and call SimilarityEvaluator using Azure AI + Project URL in the following format https://{resource_name}.services.ai.azure.com/api/projects/{project_name} .. admonition:: Example: @@ -57,13 +65,16 @@ class SimilarityEvaluator(PromptyEvaluatorBase): :end-before: [END threshold_similarity_evaluator] :language: python :dedent: 8 - :caption: Initialize with a threshold and call a SimilarityEvaluator. + :caption: Initialize with a threshold and call a + SimilarityEvaluator. .. note:: - To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added. - To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output; - however, it is recommended to use the new key moving forward as the old key will be deprecated in the future. + To align with our support of diverse models, an output key without the + `gpt_` prefix has been added. 
To maintain backwards compatibility, the + old key with the `gpt_` prefix is still present in the output; however, + it is recommended to use the new key moving forward as the old key will + be deprecated in the future. """ # Constants must be defined within eval's directory to be save/loadable @@ -72,10 +83,10 @@ class SimilarityEvaluator(PromptyEvaluatorBase): _RESULT_KEY = "similarity" id = "azureai://built-in/evaluators/similarity" - """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" + """Evaluator identifier for cloud evaluation.""" @override - def __init__(self, model_config, *, threshold=3): + def __init__(self, model_config, *, threshold=3, **kwargs): current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) self._threshold = threshold @@ -86,13 +97,14 @@ def __init__(self, model_config, *, threshold=3): result_key=self._RESULT_KEY, threshold=threshold, _higher_is_better=self._higher_is_better, + **kwargs, ) # Ignoring a mypy error about having only 1 overload function. - # We want to use the overload style for all evals, even single-inputs. This is both to make - # refactoring to multi-input styles easier, stylistic consistency consistency across evals, - # and due to the fact that non-overloaded syntax now causes various parsing issues that - # we don't want to deal with. + # We want to use the overload style for all evals, even single-inputs. + # This makes refactoring to multi-input styles easier, keeps stylistic + # consistency across evals, and avoids parsing issues with non-overloaded + # syntax. @overload # type: ignore def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]: """