Commit 96e2089

Salma Elshafey and Copilot authored
Add Tool Input Accuracy and Tool Selection Evaluators, add underscore prefix to PrPr evaluators (#43670)
* Add tool input accuracy as private preview
* Add underscore prefix to PrPr evals, add tool selection eval, add test files for tool selection and tool input accuracy
* run black
* Fix copilot comments
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty (Co-authored-by: Copilot <[email protected]>)
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py (Co-authored-by: Copilot <[email protected]>)
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py (Co-authored-by: Copilot <[email protected]>)
* Fix eval mapping
* Update tool call accuracy docstring
* Update the result of task completion to be 0-1 instead of True-False
* black
* Update samples

Co-authored-by: Salma Elshafey <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 62bd691 commit 96e2089

24 files changed: +2404 -208 lines
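The headline change is that the private-preview ("PrPr") evaluators are now exported with a leading underscore. Based on the import paths that appear in the diffs below, downstream code would reference them roughly like this (a sketch, not an official usage sample):

# Underscore-prefixed private-preview evaluators after this commit
# (import paths as they appear in the updated __init__.py files and _eval_mapping.py).
from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    _TaskNavigationEfficiencyEvaluator,
    _TaskNavigationEfficiencyMatchingMode,
)
from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator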

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 12 additions & 4 deletions
@@ -670,13 +670,21 @@ def _pretty_format_conversation_history(conversation_history):
         ):
             formatted_history += f"User turn {i+1}:\n"
             for msg in user_query:
-                formatted_history += " " + "\n ".join(msg)
-                formatted_history += "\n\n"
+                if isinstance(msg, list):
+                    for submsg in msg:
+                        formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
+                else:
+                    formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
+            formatted_history += "\n"
         if agent_response:
             formatted_history += f"Agent turn {i+1}:\n"
             for msg in agent_response:
-                formatted_history += " " + "\n ".join(msg)
-                formatted_history += "\n\n"
+                if isinstance(msg, list):
+                    for submsg in msg:
+                        formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
+                else:
+                    formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
+            formatted_history += "\n"
     return formatted_history
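This hunk appears to fix two issues in the history formatter: a message that is itself a list of strings is now flattened entry by entry, and embedded newlines inside a string message are re-indented instead of the string being join()-ed character by character. A standalone sketch of the new behavior (this mirrors the hunk above; it is not the SDK function itself):

def format_turn(label, messages):
    # messages may contain plain strings or nested lists of strings
    out = f"{label}:\n"
    for msg in messages:
        if isinstance(msg, list):
            for submsg in msg:
                out += " " + "\n ".join(submsg.split("\n")) + "\n"
        else:
            out += " " + "\n ".join(msg.split("\n")) + "\n"
    return out + "\n"

print(format_turn("User turn 1", ["Book a flight\nto Paris", ["Economy class", "Window seat"]]))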
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py

Lines changed: 10 additions & 2 deletions
@@ -11,7 +11,11 @@
 
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
-from azure.ai.evaluation._evaluators._task_completion import TaskCompletionEvaluator
+from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
+from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
+from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
+from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator
+from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,

@@ -68,8 +72,12 @@
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
-    TaskCompletionEvaluator: "task_completion",
+    _TaskCompletionEvaluator: "task_completion",
+    _TaskNavigationEfficiencyEvaluator: "task_navigation_efficiency",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
+    _ToolInputAccuracyEvaluator: "tool_input_accuracy",
+    _ToolSelectionEvaluator: "tool_selection",
+    _ToolSuccessEvaluator: "tool_success",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
 }
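For orientation, this file keys evaluator classes to the snake_case metric names used in evaluation results. A minimal illustrative fragment of that shape (the dictionary's real name and full contents live in _eval_mapping.py; EVAL_CLASS_TO_METRIC below is a hypothetical stand-in):

from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator

EVAL_CLASS_TO_METRIC = {  # hypothetical name; mirrors the entries added above
    _ToolSelectionEvaluator: "tool_selection",
    _ToolSuccessEvaluator: "tool_success",
}

print(EVAL_CLASS_TO_METRIC[_ToolSelectionEvaluator])  # tool_selection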

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 156 additions & 1 deletion
@@ -5,7 +5,8 @@
 import math
 import re
 import os
-from typing import Dict, Optional, TypeVar, Union
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty

@@ -188,3 +189,157 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
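The helpers above fully support only the converter-style tool-call shape, with fallbacks for two other shapes in _extract_tool_names_from_calls. Below is a standalone mirror of that name-extraction logic, with example payloads taken from the comments in the diff (the argument values are made up for illustration; this is not the SDK method itself):

from typing import Dict, List

def extract_tool_names(tool_calls: List[Dict]) -> List[str]:
    # Standalone illustration of _extract_tool_names_from_calls.
    names = []
    for call in tool_calls:
        if isinstance(call, dict):
            if call.get("type") == "tool_call" and call.get("name"):
                names.append(call["name"])  # converter format
            elif call.get("function", {}).get("name"):
                names.append(call["function"]["name"])  # function-call format
            elif call.get("name"):
                names.append(call["name"])  # direct-name format
    return names

calls = [
    {"type": "tool_call", "name": "bing_custom_search", "arguments": {"query": "weather in Paris"}},
    {"function": {"name": "get_attractions", "arguments": '{"city": "Paris"}'}},
    {"name": "get_attractions"},
]
print(extract_tool_names(calls))  # ['bing_custom_search', 'get_attractions', 'get_attractions']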

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._task_completion import TaskCompletionEvaluator
+from ._task_completion import _TaskCompletionEvaluator
 
-__all__ = ["TaskCompletionEvaluator"]
+__all__ = ["_TaskCompletionEvaluator"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py

Lines changed: 15 additions & 15 deletions
@@ -18,7 +18,7 @@
 
 
 @experimental
-class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
+class _TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:
 
     - Final outcome and deliverable of the task

@@ -27,8 +27,8 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
     This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.
 
     Scoring is binary:
-    - TRUE: Task fully completed with usable deliverable that meets all user requirements
-    - FALSE: Task incomplete, partially completed, or deliverable does not meet requirements
+    - 1 (pass): Task fully completed with usable deliverable that meets all user requirements
+    - 0 (fail): Task incomplete, partially completed, or deliverable does not meet requirements
 
     The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.
 

@@ -43,7 +43,7 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
             :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a TaskCompletionEvaluator with a query and response.
+            :caption: Initialize and call a _TaskCompletionEvaluator with a query and response.
 
     .. admonition:: Example using Azure AI Project URL:
 

@@ -52,7 +52,7 @@ class TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
             :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call TaskCompletionEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call a _TaskCompletionEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """

@@ -83,20 +83,20 @@ def __call__(
         query: Union[str, List[dict]],
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
-    ) -> Dict[str, Union[str, bool]]:
+    ) -> Dict[str, Union[str, float]]:
         """Evaluate task completion for a given query, response, and optionally tool definitions.
         The query and response can be either a string or a list of messages.
 
 
         Example with string inputs and no tools:
-            evaluator = TaskCompletionEvaluator(model_config)
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
             response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."
 
             result = evaluator(query=query, response=response)
 
         Example with list of messages:
-            evaluator = TaskCompletionEvaluator(model_config)
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
             tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]

@@ -110,7 +110,7 @@ def __call__(
         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
         :return: A dictionary with the task completion evaluation results.
-        :rtype: Dict[str, Union[str, bool]]
+        :rtype: Dict[str, Union[str, float]]
         """
 
     @override

@@ -127,7 +127,7 @@ def __call__( # pylint: disable=docstring-missing-param
         return super().__call__(*args, **kwargs)
 
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # type: ignore[override]
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
         """Do Task Completion evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict

@@ -153,11 +153,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         llm_output = prompty_output_dict.get("llm_output", {})
 
         if isinstance(llm_output, dict):
-            success = llm_output.get("success", False)
+            success = llm_output.get("success", 0)
             if isinstance(success, str):
-                success = success.upper() == "TRUE"
+                success = 1 if success.upper() == "TRUE" else 0
 
-            success_result = "pass" if success else "fail"
+            success_result = "pass" if success == 1 else "fail"
             reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": success,

@@ -173,5 +173,5 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
-            logger.warning("LLM output is not a dictionary, returning False for the success.")
-        return {self._result_key: False}
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+        return {self._result_key: 0}
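Tying the rename to the new 0/1 scoring, a minimal usage sketch built from the docstring example above (the model_config fields are placeholders, and the returned keys beyond the score and pass/fail result are assumed from the _result_key convention rather than spelled out in this diff):

from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator

model_config = {
    # Placeholder Azure OpenAI model configuration; substitute real values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = _TaskCompletionEvaluator(model_config)
result = evaluator(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

# After this commit the score is numeric rather than boolean:
# result["task_completion"] is 1 (pass) or 0 (fail), and
# result["task_completion_result"] is "pass" or "fail".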

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator, TaskNavigationEfficiencyMatchingMode
+from ._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator, _TaskNavigationEfficiencyMatchingMode
 
-__all__ = ["TaskNavigationEfficiencyEvaluator", "TaskNavigationEfficiencyMatchingMode"]
+__all__ = ["_TaskNavigationEfficiencyEvaluator", "_TaskNavigationEfficiencyMatchingMode"]
