Skip to content

Commit bcec119

Browse files
salma-elshafeySalma Elshafey
andauthored
Add support for Built-in Tools for the Tool Call Accuracy Evaluator (#42359)
* support 5 levels, evaluate all tools at once
* Update sample notebook and change log
* Add missing import
* Modify test cases to match the new output format
* Modify other test file to match the new output format
* Fixed parsing of results
* Change key name in output
* Spell check fixes
* Minor prompt update
* Update result key to tool_call_accuracy
* Delete test_new_evaluator.ipynb
* Added field names and messages as constants
* Additional note in prompt
* Re-add the temperature to the prompty file
* Removed 'applicable' field and print statement
* Move excess/missing tool calls fields under additional details
* Typo fix and removal of redundant field in the prompt
* Modify per_tool_call_details field's name to details
* Revert "Modify per_tool_call_details field's name to details" This reverts commit 2c3ce50.
* Revert 'Merge branch 'main' into selshafey/improve_tool_call_accuracy'
* Black reformat
* Reformat with black
* To re-trigger build pipelines
* Add notebook for bugbash
* modify bugbash notebook
* Add support for built-in tools for Tool Call Accuracy Evaluator
* Remove bugbash notebook
* Resolve issues with merge
* Fix id value
* Use existing built-in tool definitions
* Run black
* Prompt modifications
* Add test cases for built-in tools
* Handle converter format
* Add test cases for converter format
* Support only converter format
* Revert tool definitions to be required, run black
---------
Co-authored-by: Salma Elshafey <[email protected]>
1 parent 78e1967 commit bcec119

File tree

4 files changed

+399
-77
lines changed

4 files changed

+399
-77
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,12 @@
3333
# Constants to only be used internally in this file for the built-in tools.
3434
_CODE_INTERPRETER = "code_interpreter"
3535
_BING_GROUNDING = "bing_grounding"
36+
_BING_CUSTOM_SEARCH = "bing_custom_search"
3637
_FILE_SEARCH = "file_search"
3738
_AZURE_AI_SEARCH = "azure_ai_search"
39+
_SHAREPOINT_GROUNDING = "sharepoint_grounding"
3840
_FABRIC_DATAAGENT = "fabric_dataagent"
41+
_OPENAPI = "openapi"
3942

4043
# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
4144
# for evaluation purposes.
@@ -44,9 +47,12 @@
4447
+ "generate code, and create graphs and charts using your data. Supports "
4548
+ "up to 20 files.",
4649
_BING_GROUNDING: "Enhance model output with web data.",
47-
_FILE_SEARCH: "Search for data across uploaded files.",
50+
_BING_CUSTOM_SEARCH: "Enables agents to retrieve content from a curated subset of websites, enhancing relevance and reducing noise from public web searches.",
51+
_FILE_SEARCH: "Search for data across uploaded files. A single call can return multiple results/files in the 'results' field.",
4852
_AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
53+
_SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
4954
_FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
55+
_OPENAPI: "Connects agents to external RESTful APIs using OpenAPI 3.0 specifications, enabling seamless access to third-party services.",
5056
}
5157

5258
# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
@@ -59,6 +65,15 @@
5965
"type": "object",
6066
"properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
6167
},
68+
_BING_CUSTOM_SEARCH: {
69+
"type": "object",
70+
"properties": {
71+
"requesturl": {
72+
"type": "string",
73+
"description": "Search queries, along with pre-configured site restrictions or domain filters.",
74+
}
75+
},
76+
},
6277
_FILE_SEARCH: {
6378
"type": "object",
6479
"properties": {
@@ -76,10 +91,23 @@
7691
"type": "object",
7792
"properties": {"input": {"type": "string", "description": "Search terms to use."}},
7893
},
94+
_SHAREPOINT_GROUNDING: {
95+
"type": "object",
96+
"properties": {
97+
"input": {"type": "string", "description": "A natural language query to search SharePoint content."}
98+
},
99+
},
79100
_FABRIC_DATAAGENT: {
80101
"type": "object",
81102
"properties": {"input": {"type": "string", "description": "Search terms to use."}},
82103
},
104+
_OPENAPI: {
105+
"type": "object",
106+
"properties": {
107+
"name": {"type": "string", "description": "The name of the function to call."},
108+
"arguments": {"type": "string", "description": "JSON string of the arguments to pass to the function."},
109+
},
110+
},
83111
}
84112

85113

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 93 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import os
66
import logging
77
import re
8-
from typing import Dict, List, Union, TypeVar, cast
8+
from typing import Dict, List, Union, TypeVar, Optional
99
from typing_extensions import overload, override
1010
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
1111
from azure.ai.evaluation._exceptions import (
@@ -16,12 +16,46 @@
1616
)
1717
from ..._common.utils import check_score_is_valid
1818
from azure.ai.evaluation._common._experimental import experimental
19+
from ..._converters._models import (
20+
_BUILT_IN_DESCRIPTIONS,
21+
_BUILT_IN_PARAMS,
22+
)
1923

2024
logger = logging.getLogger(__name__)
2125

2226
T_EvalValue = TypeVar("T_EvalValue")
2327

2428

29+
def _get_built_in_definition(tool_name: str):
    """Build the evaluation-time definition for a built-in tool.

    :param tool_name: Name of the tool to look up in ``_BUILT_IN_DESCRIPTIONS``.
    :type tool_name: str
    :return: A dict with ``type``, ``description``, ``name`` and ``parameters`` keys,
        or ``None`` when ``tool_name`` is not a known built-in tool.
    """
    if tool_name not in _BUILT_IN_DESCRIPTIONS:
        return None

    # The tool's own name doubles as its "type"; a tool without an entry in
    # _BUILT_IN_PARAMS gets an empty parameter schema.
    definition = {
        "type": tool_name,
        "description": _BUILT_IN_DESCRIPTIONS[tool_name],
        "name": tool_name,
        "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
    }
    return definition
39+
40+
41+
def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
    """Collect the built-in tool definitions required by the given tool calls.

    Only the converter format is recognised, i.e. entries shaped like
    ``{"type": "tool_call", "name": "bing_custom_search", "arguments": {...}}``.
    Entries that are not dictionaries, carry a different ``type``, or whose name
    is not a known built-in tool are ignored.

    :param tool_calls: Tool calls to inspect.
    :type tool_calls: List[Dict]
    :return: One definition per distinct built-in tool, in first-call order.
    :rtype: List[Dict]
    """
    needed_definitions = []
    seen_names = set()

    for tool_call in tool_calls:
        if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
            continue

        tool_name = tool_call.get("name")
        # A malformed call may carry a non-string name. An unhashable one (list/dict)
        # would make the dict/set membership tests raise TypeError, so skip it here.
        if not isinstance(tool_name, str) or tool_name in seen_names:
            continue

        # Returns None for names that are not built-in tools.
        built_in_def = _get_built_in_definition(tool_name)
        if built_in_def:
            # A definition is fully determined by its name, so tracking names
            # de-duplicates without comparing whole dicts.
            seen_names.add(tool_name)
            needed_definitions.append(built_in_def)

    return needed_definitions
57+
58+
2559
@experimental
2660
class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
2761
"""The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -153,10 +187,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
153187
# TODO add warning that only tool calls of type function are supported
154188
# Collect inputs
155189
tool_calls = kwargs.get("tool_calls")
156-
tool_definitions = kwargs.get("tool_definitions")
190+
tool_definitions = kwargs.get("tool_definitions", []) # Default to empty list
157191
query = kwargs.get("query")
158192
response = kwargs.get("response")
159-
160193
# TODO : Support classes that represents tool calls, messages etc once client side definitions are available
161194
if response:
162195
parsed_tool_calls = self._parse_tools_from_response(response)
@@ -165,20 +198,23 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
165198

166199
if not tool_calls:
167200
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
168-
if not tool_definitions or len(tool_definitions) == 0:
169-
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
170201

171202
if not isinstance(tool_calls, list):
172203
tool_calls = [tool_calls]
173204
if not isinstance(tool_definitions, list):
174-
tool_definitions = [tool_definitions]
205+
tool_definitions = [tool_definitions] if tool_definitions else []
175206

176207
try:
177208
needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
178209
except EvaluationException as e:
179-
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
210+
# Check if this is because no tool definitions were provided at all
211+
if len(tool_definitions) == 0:
212+
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
213+
else:
214+
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
215+
180216
if len(needed_tool_definitions) == 0:
181-
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
217+
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
182218

183219
return {
184220
"query": query,
@@ -269,32 +305,63 @@ def _not_applicable_result(self, error_message):
269305
}
270306

271307
def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
    """Validate the tool calls and assemble the definitions to evaluate them against.

    Every tool call must be a converter-format dictionary
    (``{"type": "tool_call", "name": ..., "arguments": {...}}``). A call to a
    built-in tool needs no user-supplied definition; any other call must match a
    user-supplied definition by name whose ``type`` is ``"function"`` (the default
    when ``type`` is absent).

    :param tool_calls: List of tool calls to evaluate.
    :type tool_calls: List[dict]
    :param tool_definitions: List of user-provided tool definitions.
    :type tool_definitions: List[dict]
    :return: All user-provided definitions followed by the definitions of the
        built-in tools that were called.
    :rtype: List[dict]
    :raises EvaluationException: If a tool call is not a dictionary, is not in
        converter format, has a missing or non-string name, or refers to a
        function tool that has no matching definition.
    """

    def _user_error(message):
        # All validation failures share the same blame/category/target.
        return EvaluationException(
            message=message,
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
        )

    # Validate before collecting built-in definitions so malformed input surfaces
    # as an EvaluationException rather than an unrelated error further down.
    for tool_call in tool_calls:
        if not isinstance(tool_call, dict):
            raise _user_error(f"Tool call is not a dictionary: {tool_call}")

        if tool_call.get("type") != "tool_call":
            # Unsupported tool format - only converter format is supported
            raise _user_error(f"Unsupported tool call format. Only converter format is supported: {tool_call}")

        tool_name = tool_call.get("name")
        if not tool_name:
            raise _user_error(f"Tool call missing name: {tool_call}")
        if not isinstance(tool_name, str):
            # An unhashable name (list/dict) would make the membership test below
            # raise TypeError instead of a user-facing error.
            raise _user_error(f"Tool call name must be a string: {tool_call}")

        if tool_name in _BUILT_IN_DESCRIPTIONS:
            # Built-in tool: its definition is supplied by this module.
            continue

        # Regular function tool: a matching user-provided definition is required.
        has_definition = any(
            isinstance(tool, dict) and tool.get("name") == tool_name and tool.get("type", "function") == "function"
            for tool in tool_definitions
        )
        if not has_definition:
            raise _user_error(f"Tool definition for {tool_name} not found")

    # All user-provided definitions, then the built-in ones that were called.
    needed_tool_definitions = list(tool_definitions)
    needed_tool_definitions.extend(_get_needed_built_in_definitions(tool_calls))
    return needed_tool_definitions
299366

300367
@override

0 commit comments

Comments
 (0)