|
| 1 | +# --------------------------------------------------------- |
| 2 | +# Copyright (c) Microsoft Corporation. All rights reserved. |
| 3 | +# --------------------------------------------------------- |
| 4 | +import os |
| 6 | +import logging |
| 7 | +from typing import Dict, Union, List, Optional |
| 8 | + |
| 9 | +from typing_extensions import overload, override |
| 10 | + |
| 11 | +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget |
| 12 | +from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase |
| 13 | +from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions |
| 15 | +from azure.ai.evaluation._common._experimental import experimental |
| 16 | + |
| 17 | +logger = logging.getLogger(__name__) |
| 18 | + |
| 19 | + |
| 20 | +@experimental |
| 21 | +class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]): |
| 22 | + """The Task Success evaluator determines whether an AI agent successfully completed the requested task based on: |
| 23 | +
|
| 24 | + - Final outcome and deliverable of the task |
| 25 | + - Completeness of task requirements |
| 26 | +
|
| 27 | + This evaluator focuses solely on task completion and success, not on task adherence or intent understanding. |
| 28 | +
|
| 29 | + Scoring is binary: |
| 30 | + - TRUE: Task fully completed with usable deliverable that meets all user requirements |
| 31 | + - FALSE: Task incomplete, partially completed, or deliverable does not meet requirements |
| 32 | +
|
| 33 | + The evaluation includes task requirement analysis, outcome assessment, and completion gap identification. |
| 34 | +
|
| 35 | +
|
| 36 | + :param model_config: Configuration for the Azure OpenAI model. |
| 37 | + :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, |
| 38 | + ~azure.ai.evaluation.OpenAIModelConfiguration] |
| 39 | +
|
| 40 | +    .. admonition:: Example:
|    | +
| 41 | + .. literalinclude:: ../samples/evaluation_samples_evaluate.py |
| 42 | + :start-after: [START task_success_evaluator] |
| 43 | + :end-before: [END task_success_evaluator] |
| 44 | + :language: python |
| 45 | + :dedent: 8 |
| 46 | + :caption: Initialize and call a TaskSuccessEvaluator with a query and response. |
| 47 | +
|
| 48 | + .. admonition:: Example using Azure AI Project URL: |
| 49 | +
|
| 50 | + .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py |
| 51 | + :start-after: [START task_success_evaluator] |
| 52 | + :end-before: [END task_success_evaluator] |
| 53 | + :language: python |
| 54 | + :dedent: 8 |
| 55 | + :caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format |
| 56 | + https://{resource_name}.services.ai.azure.com/api/projects/{project_name} |
| 57 | +
|
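|    | +    .. admonition:: Example (inline sketch):
|    | +
|    | +        A minimal, hypothetical usage sketch. The import path and the endpoint,
|    | +        key, and deployment values below are placeholders and assumptions, not
|    | +        verified configuration.
|    | +
|    | +        .. code-block:: python
|    | +
|    | +            # Assumed public import path; adjust to wherever TaskSuccessEvaluator
|    | +            # is exported in your installed version of azure-ai-evaluation.
|    | +            from azure.ai.evaluation import TaskSuccessEvaluator
|    | +
|    | +            # Placeholder Azure OpenAI model configuration (values are not real).
|    | +            model_config = {
|    | +                "azure_endpoint": "https://<your-resource>.openai.azure.com",
|    | +                "api_key": "<your-api-key>",
|    | +                "azure_deployment": "<your-deployment>",
|    | +            }
|    | +
|    | +            evaluator = TaskSuccessEvaluator(model_config)
|    | +            result = evaluator(
|    | +                query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
|    | +                response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
|    | +            )
|    | +            # Result keys are derived from the evaluator's result key ("task_success").
|    | +            print(result["task_success"], result["task_success_result"])
|    | +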
| 58 | + """ |
| 59 | + |
| 60 | + _PROMPTY_FILE = "task_success.prompty" |
| 61 | + _RESULT_KEY = "task_success" |
| 62 | + _OPTIONAL_PARAMS = ["tool_definitions"] |
| 63 | + |
| 64 | + id = "azureai://built-in/evaluators/task_success" |
| 65 | + """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" |
| 66 | + |
| 67 | + @override |
| 68 | + def __init__(self, model_config, *, credential=None, **kwargs): |
| 69 | + current_dir = os.path.dirname(__file__) |
| 70 | + prompty_path = os.path.join(current_dir, self._PROMPTY_FILE) |
| 71 | + super().__init__( |
| 72 | + model_config=model_config, |
| 73 | + prompty_file=prompty_path, |
| 74 | + result_key=self._RESULT_KEY, |
| 75 | + credential=credential, |
| 76 | + **kwargs, |
| 77 | + ) |
| 78 | + |
| 79 | + @overload |
| 80 | + def __call__( |
| 81 | + self, |
| 82 | + *, |
| 83 | + query: Union[str, List[dict]], |
| 84 | + response: Union[str, List[dict]], |
| 85 | + tool_definitions: Optional[Union[dict, List[dict]]] = None, |
| 86 | + ) -> Dict[str, Union[str, bool]]: |
| 87 | + """Evaluate task success for a given query, response, and optionally tool definitions. |
| 88 | + The query and response can be either a string or a list of messages. |
| 89 | +
|
| 90 | +
|
| 91 | + Example with string inputs and no tools: |
| 92 | + evaluator = TaskSuccessEvaluator(model_config) |
| 93 | + query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine." |
| 94 | + response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..." |
| 95 | +
|
| 96 | + result = evaluator(query=query, response=response) |
| 97 | +
|
| 98 | + Example with list of messages: |
| 99 | + evaluator = TaskSuccessEvaluator(model_config) |
| 100 | + query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}] |
| 101 | + response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}] |
| 102 | + tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}] |
| 103 | +
|
| 104 | + result = evaluator(query=query, response=response, tool_definitions=tool_definitions) |
| 105 | +
|
| 106 | + :keyword query: The query being evaluated, either a string or a list of messages. |
| 107 | + :paramtype query: Union[str, List[dict]] |
| 108 | +        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls).
| 109 | + :paramtype response: Union[str, List[dict]] |
| 110 | + :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of. |
| 111 | + :paramtype tool_definitions: Optional[Union[dict, List[dict]]] |
| 112 | + :return: A dictionary with the task success evaluation results. |
| 113 | + :rtype: Dict[str, Union[str, bool]] |
| 114 | + """ |
| 115 | + |
| 116 | + @override |
| 117 | + def __call__( # pylint: disable=docstring-missing-param |
| 118 | + self, |
| 119 | + *args, |
| 120 | + **kwargs, |
| 121 | + ): |
| 122 | + """ |
| 123 | + Invokes the instance using the overloaded __call__ signature. |
| 124 | +
|
| 125 | + For detailed parameter types and return value documentation, see the overloaded __call__ definition. |
| 126 | + """ |
| 127 | + return super().__call__(*args, **kwargs) |
| 128 | + |
| 129 | + @override |
| 130 | + async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # type: ignore[override] |
| 131 | + """Do Task Success evaluation. |
| 132 | +        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
| 133 | +        :type eval_input: Dict
| 134 | +        :return: The evaluation result.
| 135 | +        :rtype: Dict[str, Union[bool, str]]
| 136 | + """ |
| 137 | + # we override the _do_eval method as we want the output to be a dictionary, |
| 138 | + # which is a different schema than _base_prompty_eval.py |
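|     | +        # The returned dictionary uses self._result_key ("task_success") as its key prefix:
|     | +        #   task_success          -> bool verdict from the model
|     | +        #   task_success_result   -> "pass" or "fail"
|     | +        #   task_success_reason   -> the model's explanation for the verdict
|     | +        #   task_success_details  -> any additional details returned by the model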
| 139 | + if "query" not in eval_input and "response" not in eval_input: |
| 140 | + raise EvaluationException( |
| 141 | + message=f"Both query and response must be provided as input to the Task Success evaluator.", |
| 142 | + internal_message=f"Both query and response must be provided as input to the Task Success evaluator.", |
| 143 | + blame=ErrorBlame.USER_ERROR, |
| 144 | + category=ErrorCategory.MISSING_FIELD, |
| 145 | + target=ErrorTarget.TASK_SUCCESS_EVALUATOR, |
| 146 | + ) |
| 147 | + eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True) |
| 148 | + eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True) |
| 149 | + if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None: |
| 150 | + eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger) |
| 151 | + |
| 152 | + llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) |
| 153 | + if isinstance(llm_output, dict): |
| 154 | + success = llm_output.get("success", False) |
| 155 | + if isinstance(success, str): |
| 156 | + success = success.upper() == "TRUE" |
| 157 | + |
| 158 | +            success_result = "pass" if success is True else "fail"
| 159 | + reason = llm_output.get("explanation", "") |
| 160 | + return { |
| 161 | + f"{self._result_key}": success, |
| 162 | + f"{self._result_key}_result": success_result, |
| 163 | + f"{self._result_key}_reason": reason, |
| 164 | + f"{self._result_key}_details": llm_output.get("details", ""), |
| 165 | + } |
| 166 | + if logger: |
| 167 | +            logger.warning("LLM output is not a dictionary; returning False for task success.")
| 168 | + return {self._result_key: False} |