Skip to content

Commit c922f0a

Browse files
authored
Tools Evaluators Fix: Check input tool_definitions for built-in tools to handle Agent v2 case (#44188)
* Tools Evaluators Fix: Check input tool_definitions for built-in tools to handle Agent v2 case
* Fix unittests
1 parent 11e8f5e commit c922f0a

File tree

2 files changed

+10
-23
lines changed

2 files changed

+10
-23
lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -287,8 +287,7 @@ def _extract_needed_tool_definitions(
287287
elif tool_name:
288288
# This is a regular function tool from converter
289289
tool_definition_exists = any(
290-
tool.get("name") == tool_name and tool.get("type", "function") == "function"
291-
for tool in tool_definitions_expanded
290+
tool.get("name") == tool_name for tool in tool_definitions_expanded
292291
)
293292
if not tool_definition_exists:
294293
raise EvaluationException(

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 9 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -297,7 +297,7 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config):
297297

298298
assert "Invalid score value" in str(exc_info.value)
299299

300-
def test_evaluate_tools_some_not_applicable(self, mock_model_config):
300+
def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config):
301301
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
302302
evaluator._flow = MagicMock(side_effect=flow_side_effect)
303303

@@ -331,21 +331,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
331331
}
332332
},
333333
},
334-
},
335-
{
336-
"name": "buy_jacket",
337-
"type": "another_built_in", # This tool will be filtered out
338-
"description": "Buy a jacket of the given type.",
339-
"parameters": {
340-
"type": "object",
341-
"properties": {
342-
"type": {
343-
"type": "string",
344-
"description": "The type of jacket to buy.",
345-
}
346-
},
347-
},
348-
},
334+
}, # buy_jacket definition is missing
349335
]
350336
result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
351337

@@ -357,7 +343,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
357343
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
358344
assert result[f"{key}_details"] == {}
359345

360-
def test_evaluate_tools_all_not_applicable(self, mock_model_config):
346+
def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
361347
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
362348
evaluator._flow = MagicMock(side_effect=flow_side_effect)
363349

@@ -374,7 +360,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
374360
tool_definitions = [
375361
{
376362
"name": "fetch_weather",
377-
"type": "some_built_in", # Not a 'function' type
363+
"type": "some_built_in", # Not a 'function' type but shouldn't be filtered out
378364
"description": "Fetches the weather information for the specified location.",
379365
"parameters": {
380366
"type": "object",
@@ -391,11 +377,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
391377

392378
key = ToolCallAccuracyEvaluator._RESULT_KEY
393379
assert result is not None
394-
assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
380+
assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
381+
assert result[key] == 5.0 # All good gets score 5
395382
assert result[f"{key}_result"] == "pass"
396383
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
397-
assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
398-
assert result[f"{key}_details"] == {}
384+
assert f"{key}_reason" in result
385+
assert result[f"{key}_reason"] == "Evaluated 1 tool calls with 1 correct calls."
386+
assert f"{key}_details" in result
399387

400388
def test_evaluate_tools_no_tools(self, mock_model_config):
401389
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)

0 commit comments

Comments
 (0)