
Commit e453b20

update tool accuracy for new behavior around built-in tools (Azure#40829)
* update tool accuracy for new behavior around built-in tools
* include changes for converter
* fix imports
* test tool definition conversion
* move constants to base eval
* check yourself
* assistant_id --> agent_id
* off by one version, fix

Co-authored-by: spon <[email protected]>
1 parent 9fe6528 commit e453b20

File tree

7 files changed (+749, -58 lines)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_ai_services.py

Lines changed: 3 additions & 41 deletions
@@ -15,7 +15,7 @@
 from azure.ai.evaluation._common._experimental import experimental
 
 # Constants.
-from ._models import _USER, _AGENT, _TOOL, _TOOL_CALL, _TOOL_CALLS, _FUNCTION
+from ._models import _USER, _AGENT, _TOOL, _TOOL_CALL, _TOOL_CALLS, _FUNCTION, _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
 
 # Message instances.
 from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
@@ -32,46 +32,6 @@
 # Maximum number of workers allowed to make API calls at the same time.
 _MAX_WORKERS = 10
 
-# Constants to only be used internally in this file for the built-in tools.
-_CODE_INTERPRETER = "code_interpreter"
-_BING_GROUNDING = "bing_grounding"
-_FILE_SEARCH = "file_search"
-
-# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
-# for evaluation purposes.
-_BUILT_IN_DESCRIPTIONS = {
-    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
-    + "generate code, and create graphs and charts using your data. Supports "
-    + "up to 20 files.",
-    _BING_GROUNDING: "Enhance model output with web data.",
-    _FILE_SEARCH: "Search for data across uploaded files.",
-}
-
-# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
-_BUILT_IN_PARAMS = {
-    _CODE_INTERPRETER: {
-        "type": "object",
-        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
-    },
-    _BING_GROUNDING: {
-        "type": "object",
-        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
-    },
-    _FILE_SEARCH: {
-        "type": "object",
-        "properties": {
-            "ranking_options": {
-                "type": "object",
-                "properties": {
-                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
-                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
-                },
-                "description": "Ranking options for search results.",
-            }
-        },
-    },
-}
-
 @experimental
 class AIAgentConverter:
     """
@@ -202,6 +162,7 @@ def _extract_function_tool_definitions(thread_run: ThreadRun) -> List[ToolDefinition]:
 
             final_tools.append(
                 ToolDefinition(
+                    type="function",
                     name=tool_function.name,
                     description=tool_function.description,
                     parameters=parameters,
@@ -213,6 +174,7 @@ def _extract_function_tool_definitions(thread_run: ThreadRun) -> List[ToolDefinition]:
         if tool.type in _BUILT_IN_DESCRIPTIONS and tool.type in _BUILT_IN_PARAMS:
            final_tools.append(
                ToolDefinition(
+                   type=tool.type,
                    name=tool.type,
                    description=_BUILT_IN_DESCRIPTIONS[tool.type],
                    parameters=_BUILT_IN_PARAMS[tool.type],
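
The net effect in this file: every ToolDefinition the converter emits now carries an explicit type, with user-defined functions getting the literal "function" and built-in tools reusing the service tool type as both type and name. A minimal sketch of the two resulting shapes (not part of the commit; the fetch_weather function is hypothetical, while the built-in values come from the _BUILT_IN_* constants now housed in _models.py):

from azure.ai.evaluation._converters._models import (
    ToolDefinition,
    _BUILT_IN_DESCRIPTIONS,
    _BUILT_IN_PARAMS,
)

# User-defined function tool: type is always the literal "function".
weather_tool = ToolDefinition(
    type="function",
    name="fetch_weather",  # hypothetical user function
    description="Return the current weather for a city.",
    parameters={"type": "object", "properties": {"city": {"type": "string"}}},
)

# Built-in tool: type and name are both the service tool type.
bing_tool = ToolDefinition(
    type="bing_grounding",
    name="bing_grounding",
    description=_BUILT_IN_DESCRIPTIONS["bing_grounding"],
    parameters=_BUILT_IN_PARAMS["bing_grounding"],
)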

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py

Lines changed: 65 additions & 3 deletions
@@ -21,6 +21,57 @@
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
@@ -98,13 +149,16 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.
     :type parameters: dict
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
@@ -191,6 +245,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
             arguments = {
                 "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
             }
+        elif tool_call.details["type"] == "azure_ai_search":
+            arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+        elif tool_call.details["type"] == "fabric_dataagent":
+            arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
         else:
             # unsupported tool type, skip
             return messages
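
For the two new built-ins the run-step details behave like a dict here, so the arguments are pulled out by key. A rough sketch of the mapping (the details payload and its values are assumed for illustration):

# Hypothetical run-step details for an Azure AI Search invocation:
details = {"type": "azure_ai_search", "azure_ai_search": {"input": "quarterly revenue 2024"}}

# The new branch reduces it to the arguments recorded on the tool call:
arguments = {"input": details["azure_ai_search"]["input"]}
# -> {"input": "quarterly revenue 2024"}; fabric_dataagent follows the same shape.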
@@ -217,11 +275,11 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
         # Some built-ins may have output, others may not
         # Try to retrieve it, but if we don't find anything, skip adding the message
         # Just manually converting to dicts for easy serialization for now rather than custom serializers
-        if tool_call.details.type == "code_interpreter":
+        if tool_call.details.type == _CODE_INTERPRETER:
             output = tool_call.details.code_interpreter.outputs
-        elif tool_call.details.type == "bing_grounding":
+        elif tool_call.details.type == _BING_GROUNDING:
             return messages  # not supported yet from bing grounding tool
-        elif tool_call.details.type == "file_search":
+        elif tool_call.details.type == _FILE_SEARCH:
             output = [
                 {
                     "file_id": result.file_id,
@@ -231,6 +289,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
                 }
                 for result in tool_call.details.file_search.results
             ]
+        elif tool_call.details.type == _AZURE_AI_SEARCH:
+            output = tool_call.details.azure_ai_search["output"]
+        elif tool_call.details.type == _FABRIC_DATAAGENT:
+            output = tool_call.details.fabric_dataagent["output"]
     except:
         return messages
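
On the output side the same two tools are read back through attribute access on details, with the payload under an "output" key; anything that does not match falls into the bare except and the output message is simply skipped. A minimal stand-in showing the assumed shape (illustrative, not part of the commit):

from types import SimpleNamespace

# Stand-in for a completed azure_ai_search tool call; the real details object
# comes from the agents service, so this shape is an assumption.
details = SimpleNamespace(type="azure_ai_search", azure_ai_search={"output": "3 matching documents"})
if details.type == "azure_ai_search":
    output = details.azure_ai_search["output"]  # "3 matching documents"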

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 4 additions & 0 deletions
@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
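
These class-level constants give every child evaluator one shared spelling of the "pass" / "fail" / "not applicable" verdicts; the tool call accuracy evaluator below is the first consumer. A hypothetical subclass fragment showing the intended use (self.threshold is assumed to be set by the subclass, as ToolCallAccuracyEvaluator does):

class MyEvaluator(EvaluatorBase):
    def _grade(self, score: float) -> str:
        # Compare against the shared constants instead of re-declaring literals.
        return self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT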

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 70 additions & 10 deletions
@@ -214,12 +214,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-        return {
-            self._result_key: bool(float(score)),
-            f"{self._result_key}_reason": reason,
-            "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
-        }
-        return {self._result_key: float(score)}
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -231,13 +237,55 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        if len(eval_input_list) == 0:
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "No tool calls were made.",
+                    "per_tool_call_details": []
+                    }
+
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-            per_turn_results.append(await self._do_eval(eval_input))
+            if self._is_applicable_tool(eval_input):
+                per_turn_results.append(await self._do_eval(eval_input))
+            else:
+                per_turn_results.append(self._not_applicable_result(eval_input))
 
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    def _is_applicable_tool(self, eval_input):
+        """Determine if a given tool should be evaluated, since we only evaluate tools that
+        have sufficient context available.
+
+        :type eval_input: Dict
+        :return: True if the tool call should be evaluated
+        :rtype: bool
+        """
+        tool_definition = eval_input.get("tool_definition")
+        if tool_definition is None or len(tool_definition) != 1:
+            return False
+        tool_type = tool_definition[0].get("type")
+        if tool_type is None or tool_type != "function":
+            return False
+        return True
+
+    def _not_applicable_result(self, eval_input):
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return {
+            f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_reason": "Tool call not supported for evaluation",
+            "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+        }
+
     def _aggregate_results(self, per_turn_results):
         """Aggregate the evaluation results of each conversation turn into a single result.
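
In practice only tool calls backed by exactly one definition of type "function" reach the LLM judge; built-ins and ambiguous matches short-circuit to the "not applicable" per-call result. Illustrative calls against a hypothetical evaluator instance (field values invented):

evaluator._is_applicable_tool(
    {"tool_definition": [{"type": "function", "name": "fetch_weather"}]}
)  # True: exactly one definition, and it is a user function

evaluator._is_applicable_tool(
    {"tool_definition": [{"type": "bing_grounding", "name": "bing_grounding"}]}
)  # False: routed to _not_applicable_result instead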
@@ -260,11 +308,23 @@ def _aggregate_results(self, per_turn_results):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                    "per_tool_call_details": []
+                    }
+        # ignore not_applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
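
Worked through on simplified per-call verdicts, the new denominator counts only the evaluated calls, while "not applicable" entries drop out of both numerator and denominator (threshold value illustrative):

verdicts = [True, "not applicable", False, True]  # stand-ins for per-call results

num_evaluated = len([v for v in verdicts if v != "not applicable"])  # 3
score = sum(v == True for v in verdicts) / num_evaluated             # 2/3 ~= 0.67
result = "pass" if score >= 0.8 else "fail"                          # "fail" at a 0.8 threshold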
