
Commit c62954a

Tool Output Utilization Evaluator (#43293)
* init boilerplate
* eval init
* prompt crafting updates
* examples update
* makeitwork
* updates
* black
* updates
* doc updates and formats
* update boolean
* black
* make private
* Revert "make private" This reverts commit 5844f2b.
* remove print
* changes
* black
* remove optional threshold
* remove gpt_
* add samples
* mkae it private
1 parent 4052020 commit c62954a


8 files changed (+649, -23 lines)


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@
 from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
 from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
+from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -131,6 +132,7 @@ def lazy_import():
     "CodeVulnerabilityEvaluator",
     "UngroundedAttributesEvaluator",
     "ToolCallAccuracyEvaluator",
+    "_ToolOutputUtilizationEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",
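
For orientation, here is a minimal sketch of how the newly exported evaluator could be constructed and called once this change ships. The AzureOpenAIModelConfiguration fields and the placeholder endpoint, deployment, and key values are assumptions, not part of this diff; the query and response strings come from the evaluator's own docstring example.

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, _ToolOutputUtilizationEvaluator

    # Placeholder model configuration; substitute real endpoint, deployment, and key values.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        azure_deployment="<your-deployment>",
        api_key="<your-api-key>",
    )

    evaluator = _ToolOutputUtilizationEvaluator(model_config)
    result = evaluator(
        query="What is the weather today?",
        response="The weather is sunny.",
    )
    print(result["tool_output_utilization"], result["tool_output_utilization_result"])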

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 61 additions & 23 deletions
@@ -580,36 +580,69 @@ def _extract_text_from_content(content):
     return text
 
 
-def _get_conversation_history(query, include_system_messages=False):
-    all_user_queries = []
-    cur_user_query = []
-    all_agent_responses = []
-    cur_agent_response = []
+def filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
+    """Filters the tool definitions to only include those that were actually used in the messages lists."""
+    try:
+        used_tool_names = set()
+        any_tools_used = False
+        for msgs in msgs_lists:
+            for msg in msgs:
+                if msg.get("role") == "assistant" and "content" in msg:
+                    for content in msg.get("content", []):
+                        if content.get("type") == "tool_call":
+                            any_tools_used = True
+                            if "tool_call" in content and "function" in content["tool_call"]:
+                                used_tool_names.add(content["tool_call"]["function"])
+                            elif "name" in content:
+                                used_tool_names.add(content["name"])
+
+        filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
+        if any_tools_used and not filtered_tools:
+            if logger:
+                logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
+            filtered_tools = tool_definitions
+
+        return filtered_tools
+    except Exception as e:
+        if logger:
+            logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
+        return tool_definitions
+
+
+def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
+    all_user_queries, all_agent_responses = [], []
+    cur_user_query, cur_agent_response = [], []
     system_message = None
+
     for msg in query:
-        if not "role" in msg:
+        role = msg.get("role")
+        if not role:
             continue
-        if include_system_messages and msg["role"] == "system" and "content" in msg:
+        if include_system_messages and role == "system":
             system_message = msg.get("content", "")
-        if msg["role"] == "user" and "content" in msg:
-            if cur_agent_response != []:
-                all_agent_responses.append(cur_agent_response)
+
+        elif role == "user" and "content" in msg:
+            if cur_agent_response:
+                formatted_agent_response = _get_agent_response(
+                    cur_agent_response, include_tool_messages=include_tool_messages
+                )
+                all_agent_responses.append([formatted_agent_response])
                 cur_agent_response = []
             text_in_msg = _extract_text_from_content(msg["content"])
             if text_in_msg:
                 cur_user_query.append(text_in_msg)
 
-        if msg["role"] == "assistant" and "content" in msg:
-            if cur_user_query != []:
+        elif role in ("assistant", "tool"):
+            if cur_user_query:
                 all_user_queries.append(cur_user_query)
                 cur_user_query = []
-            text_in_msg = _extract_text_from_content(msg["content"])
-            if text_in_msg:
-                cur_agent_response.append(text_in_msg)
-    if cur_user_query != []:
+            cur_agent_response.append(msg)
+
+    if cur_user_query:
         all_user_queries.append(cur_user_query)
-    if cur_agent_response != []:
-        all_agent_responses.append(cur_agent_response)
+    if cur_agent_response:
+        formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
+        all_agent_responses.append([formatted_agent_response])
 
     if len(all_user_queries) != len(all_agent_responses) + 1:
         raise EvaluationException(
@@ -619,16 +652,17 @@ def _get_conversation_history(query, include_system_messages=False):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
+
     result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
-    if include_system_messages:
+    if include_system_messages and system_message:
         result["system_message"] = system_message
     return result
 
 
 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
-    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+    if conversation_history.get("system_message"):
         formatted_history += "SYSTEM_PROMPT:\n"
         formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
@@ -646,12 +680,16 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history
 
 
-def reformat_conversation_history(query, logger=None, include_system_messages=False):
+def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
     """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
+        conversation_history = _get_conversation_history(
+            query,
+            include_system_messages=include_system_messages,
+            include_tool_messages=include_tool_messages,
+        )
         return _pretty_format_conversation_history(conversation_history)
-    except:
+    except Exception as e:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
         # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
         # From our tests the negative impact on IntentResolution is:
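
To illustrate the new helper, the sketch below (not part of the diff) calls filter_to_used_tools with a tool_call content entry in the flat layout the helper recognizes through its "name" branch; only definitions whose names actually appear in assistant tool calls are kept, and on any parsing error the helper falls back to the original list.

    import logging

    from azure.ai.evaluation._common.utils import filter_to_used_tools

    logger = logging.getLogger(__name__)

    tool_definitions = [
        {"name": "get_orders", "description": "Get the list of orders for a given account number."},
        {"name": "initiate_return", "description": "Initiate the return process for an order."},
    ]
    response = [
        {
            "role": "assistant",
            "content": [
                # Flat tool_call entry: the tool name sits directly on the content item.
                {"type": "tool_call", "name": "get_orders", "arguments": {"account_number": "888"}},
            ],
        },
    ]

    # Keeps only get_orders; initiate_return is dropped because it was never called.
    filtered = filter_to_used_tools(tool_definitions, msgs_lists=[response], logger=logger)
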
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._tool_output_utilization import _ToolOutputUtilizationEvaluator
+
+__all__ = ["_ToolOutputUtilizationEvaluator"]
Lines changed: 225 additions & 0 deletions
@@ -0,0 +1,225 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import (
+    EvaluationException,
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+    reformat_tool_definitions,
+    filter_to_used_tools,
+)
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses.
+
+    Scoring is based on two levels:
+    1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response.
+    2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response.
+
+    The evaluation includes the score, a brief explanation, and a final pass/fail result.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolOutputUtilizationEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call _ToolOutputUtilizationEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "tool_output_utilization.prompty"
+    _RESULT_KEY = "tool_output_utilization"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 1
+
+    id = "azureai://built-in/evaluators/tool_output_utilization"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(
+        self,
+        model_config,
+        *,
+        credential=None,
+        **kwargs,
+    ):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=self._DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate tool output utilization for a given query, response, and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+
+        Example with string inputs and no tools:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :return: A dictionary with the tool output utilization evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Tool Output Utilization evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input):
+            raise EvaluationException(
+                message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            )
+
+        tool_definitions = eval_input["tool_definitions"]
+        filtered_tool_definitions = filter_to_used_tools(
+            tool_definitions=tool_definitions,
+            msgs_lists=[eval_input["query"], eval_input["response"]],
+            logger=logger,
+        )
+        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"],
+            logger,
+            include_system_messages=True,
+            include_tool_messages=True,
+        )
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", "")
+        if isinstance(llm_output, dict):
+            output_label = llm_output.get("label", None)
+            if output_label is None:
+                if logger:
+                    logger.warning("LLM output does not contain 'label' key, returning NaN for the score.")
+                output_label = "fail"
+
+            output_label = output_label.lower()
+            if output_label not in ["pass", "fail"]:
+                if logger:
+                    logger.warning(
+                        f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score."
+                    )
+
+            score = 1.0 if output_label == "pass" else 0.0
+            score_result = output_label
+            reason = llm_output.get("reason", "")
+
+            faulty_details = llm_output.get("faulty_details", [])
+            if faulty_details:
+                reason += " Issues found: " + "; ".join(faulty_details)
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        score = math.nan
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
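
For reference, a rough sketch of the success-path output a caller might inspect, based on the result keys assembled in _do_eval above; evaluator, query, response, and tool_definitions are assumed to be set up as in the docstring example.

    result = evaluator(query=query, response=response, tool_definitions=tool_definitions)

    # All keys are prefixed with the result key "tool_output_utilization".
    print(result["tool_output_utilization"])                # 1.0 for "pass", 0.0 for "fail"
    print(result["tool_output_utilization_result"])         # "pass" or "fail"
    print(result["tool_output_utilization_reason"])         # explanation, with any faulty_details appended
    print(result["tool_output_utilization_threshold"])      # defaults to 1 (_DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE)
    print(result["tool_output_utilization_total_tokens"])   # token usage reported by the prompty flow

    # If the LLM output cannot be parsed as a dictionary, only the score (NaN),
    # the binary result, and the threshold keys are returned.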
