# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import math
import logging
from typing import Dict, Union, List, Optional

from typing_extensions import overload, override

from azure.ai.evaluation._exceptions import (
    EvaluationException,
    ErrorBlame,
    ErrorCategory,
    ErrorTarget,
)
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from ..._common.utils import (
    reformat_conversation_history,
    reformat_agent_response,
    reformat_tool_definitions,
    filter_to_used_tools,
)
from azure.ai.evaluation._model_configurations import Message
from azure.ai.evaluation._common._experimental import experimental

logger = logging.getLogger(__name__)


@experimental
class _ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses.

    Scoring is based on two levels:
    1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response.
    2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response.

    The evaluation includes the score, a brief explanation, and a final pass/fail result.

    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]

    .. admonition:: Example:

        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
            :start-after: [START tool_output_utilization_evaluator]
            :end-before: [END tool_output_utilization_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call a _ToolOutputUtilizationEvaluator with a query and response.

    .. admonition:: Example using Azure AI Project URL:

        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START tool_output_utilization_evaluator]
            :end-before: [END tool_output_utilization_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call _ToolOutputUtilizationEvaluator using an Azure AI Project URL in the following format:
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    """

    _PROMPTY_FILE = "tool_output_utilization.prompty"
    _RESULT_KEY = "tool_output_utilization"
    _OPTIONAL_PARAMS = ["tool_definitions"]

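    # Default threshold passed to the base prompty evaluator; scoring is binary (1.0 = pass, 0.0 = fail).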
    _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 1

    id = "azureai://built-in/evaluators/tool_output_utilization"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(
        self,
        model_config,
        *,
        credential=None,
        **kwargs,
    ):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
            threshold=self._DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE,
            credential=credential,
            _higher_is_better=True,
            **kwargs,
        )

    @overload
    def __call__(
        self,
        *,
        query: Union[str, List[dict]],
        response: Union[str, List[dict]],
        tool_definitions: Union[dict, List[dict]],
    ) -> Dict[str, Union[str, float]]:
        """Evaluate tool output utilization for a given query, response, and optional tool definitions.
        The query and response can be either a string or a list of messages.

        Example with string inputs and no tools:
            evaluator = _ToolOutputUtilizationEvaluator(model_config)
            query = "What is the weather today?"
            response = "The weather is sunny."

            result = evaluator(query=query, response=response)

        Example with a list of messages:
            evaluator = _ToolOutputUtilizationEvaluator(model_config)
            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]

            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)

        :keyword query: The query being evaluated, either a string or a list of messages.
        :paramtype query: Union[str, List[dict]]
        :keyword response: The response being evaluated, either a string or a list of messages (the full agent response, potentially including tool calls).
        :paramtype response: Union[str, List[dict]]
        :keyword tool_definitions: An optional list of the tool definitions the agent is aware of.
        :paramtype tool_definitions: Union[dict, List[dict]]
        :return: A dictionary with the tool output utilization score, reason, pass/fail result, threshold, and token usage details.
        :rtype: Dict[str, Union[str, float]]
        """

    @override
    def __call__(  # pylint: disable=docstring-missing-param
        self,
        *args,
        **kwargs,
    ):
        """
        Invokes the instance using the overloaded __call__ signature.

        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
        """
        return super().__call__(*args, **kwargs)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
        """Do Tool Output Utilization evaluation.

        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict
        """
        # We override _do_eval because we want the output to be a dictionary,
        # which is a different schema from the one in _base_prompty_eval.py.
        if ("query" not in eval_input) or ("response" not in eval_input) or ("tool_definitions" not in eval_input):
            raise EvaluationException(
                message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
                internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.MISSING_FIELD,
                target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
            )

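        # Keep only the tool definitions that are actually used in the query/response, then format them for the evaluation prompt.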
        tool_definitions = eval_input["tool_definitions"]
        filtered_tool_definitions = filter_to_used_tools(
            tool_definitions=tool_definitions,
            msgs_lists=[eval_input["query"], eval_input["response"]],
            logger=logger,
        )
        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)

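        # Convert the conversation history and the agent response into the text form passed to the prompt.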
        eval_input["query"] = reformat_conversation_history(
            eval_input["query"],
            logger,
            include_system_messages=True,
            include_tool_messages=True,
        )
        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)

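        # The prompty flow returns a dict with the parsed LLM output plus token usage and model metadata.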
        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
        llm_output = prompty_output_dict.get("llm_output", "")
        if isinstance(llm_output, dict):
            output_label = llm_output.get("label", None)
            if output_label is None:
                if logger:
                    logger.warning("LLM output does not contain a 'label' key; treating the result as a fail.")
                output_label = "fail"

            output_label = output_label.lower()
            if output_label not in ["pass", "fail"]:
                if logger:
                    logger.warning(
                        f"LLM output label is not 'pass' or 'fail' (got '{output_label}'); scoring it as a fail."
                    )

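            # Map the label to a binary score: "pass" -> 1.0, anything else -> 0.0.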
            score = 1.0 if output_label == "pass" else 0.0
            score_result = output_label
            reason = llm_output.get("reason", "")

            faulty_details = llm_output.get("faulty_details", [])
            if faulty_details:
                reason += " Issues found: " + "; ".join(faulty_details)

            return {
                f"{self._result_key}": score,
                f"{self._result_key}_reason": reason,
                f"{self._result_key}_result": score_result,
                f"{self._result_key}_threshold": self._threshold,
                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
            }
        if logger:
            logger.warning("LLM output is not a dictionary, returning NaN for the score.")

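        # Fall back to NaN (and the corresponding binary result) when the LLM output cannot be parsed as a dictionary.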
        score = math.nan
        binary_result = self._get_binary_result(score)
        return {
            self._result_key: float(score),
            f"{self._result_key}_result": binary_result,
            f"{self._result_key}_threshold": self._threshold,
        }